fix: make breakpad use miniz
Some checks failed
sm-rpc / build (Debug, arm-linux-gnueabihf) (push) Successful in 1m34s
sm-rpc / build (Debug, aarch64-linux-gnu) (push) Successful in 2m46s
sm-rpc / build (Debug, host.gcc) (push) Failing after 1m28s
sm-rpc / build (Release, aarch64-linux-gnu) (push) Successful in 2m14s
sm-rpc / build (Release, arm-linux-gnueabihf) (push) Successful in 2m8s
sm-rpc / build (Debug, mipsel-linux-gnu) (push) Successful in 5m35s
sm-rpc / build (Release, host.gcc) (push) Failing after 1m55s
sm-rpc / build (Release, mipsel-linux-gnu) (push) Successful in 7m21s

tqcq
2025-08-25 15:24:22 +08:00
parent a58517497b
commit 68b2e7f763
728 changed files with 489652 additions and 1211 deletions


@@ -0,0 +1,33 @@
#if defined(__APPLE__) && !defined(__METAL_VERSION__)
#include <TargetConditionals.h>
#endif
#define kBackbufferWidth 1280
#define kBackbufferHeight 720
#if defined(__EMSCRIPTEN__)
#define CPU_CAN_DO_SIMD 0
#define CPU_CAN_DO_THREADS 0
#else
#define CPU_CAN_DO_SIMD 1
#define CPU_CAN_DO_THREADS 1
#endif
#define DO_SAMPLES_PER_PIXEL 4
#define DO_ANIMATE_SMOOTHING 0.9f
#define DO_LIGHT_SAMPLING 1
#define DO_MITSUBA_COMPARE 0
// Should path tracing be done on the GPU with a compute shader?
#define DO_COMPUTE_GPU 0
#define kCSGroupSizeX 8
#define kCSGroupSizeY 8
#define kCSMaxObjects 64
// Should float3 struct use SSE/NEON?
#define DO_FLOAT3_WITH_SIMD (!(DO_COMPUTE_GPU) && CPU_CAN_DO_SIMD && 1)
// Should HitSpheres function use SSE/NEON?
#define DO_HIT_SPHERES_SIMD (CPU_CAN_DO_SIMD && 1)


@@ -0,0 +1,192 @@
#pragma once
#if defined(_MSC_VER)
#define VM_INLINE __forceinline
#else
#define VM_INLINE __attribute__((unused, always_inline, nodebug)) inline
#endif
#define kSimdWidth 4
#if !defined(__arm__) && !defined(__arm64__) && !defined(__aarch64__) && !defined(__EMSCRIPTEN__)
// ---- SSE implementation
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#define SHUFFLE4(V, X,Y,Z,W) float4(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(W,Z,Y,X)))
struct float4
{
VM_INLINE float4() {}
VM_INLINE explicit float4(const float *p) { m = _mm_loadu_ps(p); }
VM_INLINE explicit float4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); }
VM_INLINE explicit float4(float v) { m = _mm_set_ps1(v); }
VM_INLINE explicit float4(__m128 v) { m = v; }
VM_INLINE float getX() const { return _mm_cvtss_f32(m); }
VM_INLINE float getY() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1))); }
VM_INLINE float getZ() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2))); }
VM_INLINE float getW() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 3, 3, 3))); }
__m128 m;
};
typedef float4 bool4;
VM_INLINE float4 operator+ (float4 a, float4 b) { a.m = _mm_add_ps(a.m, b.m); return a; }
VM_INLINE float4 operator- (float4 a, float4 b) { a.m = _mm_sub_ps(a.m, b.m); return a; }
VM_INLINE float4 operator* (float4 a, float4 b) { a.m = _mm_mul_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator==(float4 a, float4 b) { a.m = _mm_cmpeq_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator!=(float4 a, float4 b) { a.m = _mm_cmpneq_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator< (float4 a, float4 b) { a.m = _mm_cmplt_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator> (float4 a, float4 b) { a.m = _mm_cmpgt_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator<=(float4 a, float4 b) { a.m = _mm_cmple_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator>=(float4 a, float4 b) { a.m = _mm_cmpge_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator&(bool4 a, bool4 b) { a.m = _mm_and_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator|(bool4 a, bool4 b) { a.m = _mm_or_ps(a.m, b.m); return a; }
VM_INLINE float4 operator- (float4 a) { a.m = _mm_xor_ps(a.m, _mm_set1_ps(-0.0f)); return a; }
VM_INLINE float4 min(float4 a, float4 b) { a.m = _mm_min_ps(a.m, b.m); return a; }
VM_INLINE float4 max(float4 a, float4 b) { a.m = _mm_max_ps(a.m, b.m); return a; }
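// horizontal min: taking min with two lane-rotated copies leaves min(x,y,z,w) in lane X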
VM_INLINE float hmin(float4 v)
{
v = min(v, SHUFFLE4(v, 2, 3, 0, 0));
v = min(v, SHUFFLE4(v, 1, 0, 0, 0));
return v.getX();
}
// Returns a 4-bit code where bit0..bit3 is X..W
VM_INLINE unsigned mask(float4 v) { return _mm_movemask_ps(v.m); }
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool4 v) { return mask(v) != 0; }
VM_INLINE bool all(bool4 v) { return mask(v) == 15; }
// "select", i.e. hibit(cond) ? b : a
// on SSE4.1 and up this can be done easily via "blend" instruction;
// on older SSEs has to do a bunch of hoops, see
// https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
VM_INLINE float4 select(float4 a, float4 b, bool4 cond)
{
#if defined(__SSE4_1__) || defined(_MSC_VER) // on windows assume we always have SSE4.1
a.m = _mm_blendv_ps(a.m, b.m, cond.m);
#else
__m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31));
a.m = _mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m));
#endif
return a;
}
VM_INLINE __m128i select(__m128i a, __m128i b, bool4 cond)
{
#if defined(__SSE4_1__) || defined(_MSC_VER) // on windows assume we always have SSE4.1
return _mm_blendv_epi8(a, b, _mm_castps_si128(cond.m));
#else
__m128i d = _mm_srai_epi32(_mm_castps_si128(cond.m), 31);
return _mm_or_si128(_mm_and_si128(d, b), _mm_andnot_si128(d, a));
#endif
}
VM_INLINE float4 sqrtf(float4 v) { return float4(_mm_sqrt_ps(v.m)); }
#elif !defined(__EMSCRIPTEN__)
// ---- NEON implementation
#define USE_NEON 1
#include <arm_neon.h>
struct float4
{
VM_INLINE float4() {}
VM_INLINE explicit float4(const float *p) { m = vld1q_f32(p); }
VM_INLINE explicit float4(float x, float y, float z, float w) { float v[4] = {x, y, z, w}; m = vld1q_f32(v); }
VM_INLINE explicit float4(float v) { m = vdupq_n_f32(v); }
VM_INLINE explicit float4(float32x4_t v) { m = v; }
VM_INLINE float getX() const { return vgetq_lane_f32(m, 0); }
VM_INLINE float getY() const { return vgetq_lane_f32(m, 1); }
VM_INLINE float getZ() const { return vgetq_lane_f32(m, 2); }
VM_INLINE float getW() const { return vgetq_lane_f32(m, 3); }
float32x4_t m;
};
typedef float4 bool4;
VM_INLINE float4 operator+ (float4 a, float4 b) { a.m = vaddq_f32(a.m, b.m); return a; }
VM_INLINE float4 operator- (float4 a, float4 b) { a.m = vsubq_f32(a.m, b.m); return a; }
VM_INLINE float4 operator* (float4 a, float4 b) { a.m = vmulq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator==(float4 a, float4 b) { a.m = vceqq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator!=(float4 a, float4 b) { a.m = vmvnq_u32(vceqq_f32(a.m, b.m)); return a; }
VM_INLINE bool4 operator< (float4 a, float4 b) { a.m = vcltq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator> (float4 a, float4 b) { a.m = vcgtq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator<=(float4 a, float4 b) { a.m = vcleq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator>=(float4 a, float4 b) { a.m = vcgeq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator&(bool4 a, bool4 b) { a.m = vandq_u32(a.m, b.m); return a; }
VM_INLINE bool4 operator|(bool4 a, bool4 b) { a.m = vorrq_u32(a.m, b.m); return a; }
VM_INLINE float4 operator- (float4 a) { a.m = vnegq_f32(a.m); return a; }
VM_INLINE float4 min(float4 a, float4 b) { a.m = vminq_f32(a.m, b.m); return a; }
VM_INLINE float4 max(float4 a, float4 b) { a.m = vmaxq_f32(a.m, b.m); return a; }
VM_INLINE float hmin(float4 v)
{
float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m));
float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs);
return vget_lane_f32(minOfMinOfHalfs, 0);
}
// Returns a 4-bit code where bit0..bit3 is X..W
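// (vtstq turns each lane into all-ones where the sign bit is set; AND-ing with
// {1,2,4,8} and OR-reducing the lanes emulates SSE's _mm_movemask_ps)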
VM_INLINE unsigned mask(float4 v)
{
static const uint32x4_t movemask = { 1, 2, 4, 8 };
static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
uint32x4_t t0 = vreinterpretq_u32_f32(v.m);
uint32x4_t t1 = vtstq_u32(t0, highbit);
uint32x4_t t2 = vandq_u32(t1, movemask);
uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
}
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool4 v) { return mask(v) != 0; }
VM_INLINE bool all(bool4 v) { return mask(v) == 15; }
// "select", i.e. hibit(cond) ? b : a
// on SSE4.1 and up this can be done easily via "blend" instruction;
// on older SSEs has to do a bunch of hoops, see
// https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
VM_INLINE float4 select(float4 a, float4 b, bool4 cond)
{
a.m = vbslq_f32(cond.m, b.m, a.m);
return a;
}
VM_INLINE int32x4_t select(int32x4_t a, int32x4_t b, bool4 cond)
{
return vbslq_s32(vreinterpretq_u32_f32(cond.m), b, a);
}
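// sqrt via reciprocal-sqrt estimate: vrsqrteq gives a coarse 1/sqrt(V), each
// vrsqrtsq multiply is one Newton-Raphson refinement, and the final multiply
// by V turns 1/sqrt(V) into sqrt(V)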
VM_INLINE float4 sqrtf(float4 v)
{
float32x4_t V = v.m;
float32x4_t S0 = vrsqrteq_f32(V);
float32x4_t P0 = vmulq_f32( V, S0 );
float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
float32x4_t S1 = vmulq_f32( S0, R0 );
float32x4_t P1 = vmulq_f32( V, S1 );
float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
float32x4_t S2 = vmulq_f32( S1, R1 );
float32x4_t P2 = vmulq_f32( V, S2 );
float32x4_t R2 = vrsqrtsq_f32( P2, S2 );
float32x4_t S3 = vmulq_f32( S2, R2 );
return float4(vmulq_f32(V, S3));
}
VM_INLINE float4 splatX(float32x4_t v) { return float4(vdupq_lane_f32(vget_low_f32(v), 0)); }
VM_INLINE float4 splatY(float32x4_t v) { return float4(vdupq_lane_f32(vget_low_f32(v), 1)); }
VM_INLINE float4 splatZ(float32x4_t v) { return float4(vdupq_lane_f32(vget_high_f32(v), 0)); }
VM_INLINE float4 splatW(float32x4_t v) { return float4(vdupq_lane_f32(vget_high_f32(v), 1)); }
#endif


@@ -0,0 +1,203 @@
#include "Maths.h"
#include <stdlib.h>
#include <stdint.h>
static uint32_t XorShift32(uint32_t& state)
{
uint32_t x = state;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 15;
state = x;
return x;
}
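// take the low 24 bits and divide by 2^24: floats have a 24-bit significand,
// so every result in [0,1) is exactly representable and 1.0f is never reached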
float RandomFloat01(uint32_t& state)
{
return (XorShift32(state) & 0xFFFFFF) / 16777216.0f;
}
float3 RandomInUnitDisk(uint32_t& state)
{
float3 p;
do
{
p = 2.0 * float3(RandomFloat01(state),RandomFloat01(state),0) - float3(1,1,0);
} while (dot(p,p) >= 1.0);
return p;
}
float3 RandomInUnitSphere(uint32_t& state)
{
float3 p;
do {
p = 2.0*float3(RandomFloat01(state),RandomFloat01(state),RandomFloat01(state)) - float3(1,1,1);
} while (sqLength(p) >= 1.0);
return p;
}
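// uniform direction on the unit sphere: a uniform z in [-1,1] plus a uniform
// azimuth is area-uniform (Archimedes' hat-box theorem)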
float3 RandomUnitVector(uint32_t& state)
{
float z = RandomFloat01(state) * 2.0f - 1.0f;
float a = RandomFloat01(state) * 2.0f * kPI;
float r = sqrtf(1.0f - z * z);
float x = r * cosf(a);
float y = r * sinf(a);
return float3(x, y, z);
}
int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit)
{
#if DO_HIT_SPHERES_SIMD
float4 hitT = float4(tMax);
#if USE_NEON
int32x4_t id = vdupq_n_s32(-1);
#else
__m128i id = _mm_set1_epi32(-1);
#endif
#if DO_FLOAT3_WITH_SIMD && !USE_NEON
float4 rOrigX = SHUFFLE4(r.orig, 0, 0, 0, 0);
float4 rOrigY = SHUFFLE4(r.orig, 1, 1, 1, 1);
float4 rOrigZ = SHUFFLE4(r.orig, 2, 2, 2, 2);
float4 rDirX = SHUFFLE4(r.dir, 0, 0, 0, 0);
float4 rDirY = SHUFFLE4(r.dir, 1, 1, 1, 1);
float4 rDirZ = SHUFFLE4(r.dir, 2, 2, 2, 2);
#elif DO_FLOAT3_WITH_SIMD
float4 rOrigX = splatX(r.orig.m);
float4 rOrigY = splatY(r.orig.m);
float4 rOrigZ = splatZ(r.orig.m);
float4 rDirX = splatX(r.dir.m);
float4 rDirY = splatY(r.dir.m);
float4 rDirZ = splatZ(r.dir.m);
#else
float4 rOrigX = float4(r.orig.x);
float4 rOrigY = float4(r.orig.y);
float4 rOrigZ = float4(r.orig.z);
float4 rDirX = float4(r.dir.x);
float4 rDirY = float4(r.dir.y);
float4 rDirZ = float4(r.dir.z);
#endif
float4 tMin4 = float4(tMin);
#if USE_NEON
int32x4_t curId = vreinterpretq_s32_u32(vcombine_u32(vcreate_u32(0ULL | (1ULL<<32)), vcreate_u32(2ULL | (3ULL<<32))));
#else
__m128i curId = _mm_set_epi32(3, 2, 1, 0);
#endif
// process 4 spheres at once
for (int i = 0; i < spheres.simdCount; i += kSimdWidth)
{
// load data for 4 spheres
float4 sCenterX = float4(spheres.centerX + i);
float4 sCenterY = float4(spheres.centerY + i);
float4 sCenterZ = float4(spheres.centerZ + i);
float4 sSqRadius = float4(spheres.sqRadius + i);
// note: we flip this vector and calculate -b (nb) since that happens to be slightly preferable computationally
float4 coX = sCenterX - rOrigX;
float4 coY = sCenterY - rOrigY;
float4 coZ = sCenterZ - rOrigZ;
float4 nb = coX * rDirX + coY * rDirY + coZ * rDirZ;
float4 c = coX * coX + coY * coY + coZ * coZ - sSqRadius;
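// with a unit-length ray direction, |co - t*dir|^2 = r^2 reduces to
// t^2 - 2*nb*t + c = 0, i.e. t = nb +/- sqrt(nb*nb - c); a negative discriminant means a miss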
float4 discr = nb * nb - c;
bool4 discrPos = discr > float4(0.0f);
// if ray hits any of the 4 spheres
if (any(discrPos))
{
float4 discrSq = sqrtf(discr);
// ray could hit spheres at t0 & t1
float4 t0 = nb - discrSq;
float4 t1 = nb + discrSq;
float4 t = select(t1, t0, t0 > tMin4); // if t0 is above min, take it (since it's the earlier hit); else try t1.
bool4 msk = discrPos & (t > tMin4) & (t < hitT);
// if hit, take it
id = select(id, curId, msk);
hitT = select(hitT, t, msk);
}
#if USE_NEON
curId = vaddq_s32(curId, vdupq_n_s32(kSimdWidth));
#else
curId = _mm_add_epi32(curId, _mm_set1_epi32(kSimdWidth));
#endif
}
// now we have up to 4 hits, find and return closest one
float minT = hmin(hitT);
if (minT < tMax) // any actual hits?
{
int minMask = mask(hitT == float4(minT));
if (minMask != 0)
{
int id_scalar[4];
float hitT_scalar[4];
#if USE_NEON
vst1q_s32(id_scalar, id);
vst1q_f32(hitT_scalar, hitT.m);
#else
_mm_storeu_si128((__m128i *)id_scalar, id);
_mm_storeu_ps(hitT_scalar, hitT.m);
#endif
// In general, you would do this with a bit scan (first set/trailing zero count).
// But who cares, it's only 16 options.
static const int laneId[16] =
{
0, 0, 1, 0, // 00xx
2, 0, 1, 0, // 01xx
3, 0, 1, 0, // 10xx
2, 0, 1, 0, // 11xx
};
int lane = laneId[minMask];
int hitId = id_scalar[lane];
float finalHitT = hitT_scalar[lane];
outHit.pos = r.pointAt(finalHitT);
outHit.normal = (outHit.pos - float3(spheres.centerX[hitId], spheres.centerY[hitId], spheres.centerZ[hitId])) * spheres.invRadius[hitId];
outHit.t = finalHitT;
return hitId;
}
}
return -1;
#else // #if DO_HIT_SPHERES_SIMD
float hitT = tMax;
int id = -1;
for (int i = 0; i < spheres.count; ++i)
{
float coX = spheres.centerX[i] - r.orig.getX();
float coY = spheres.centerY[i] - r.orig.getY();
float coZ = spheres.centerZ[i] - r.orig.getZ();
float nb = coX * r.dir.getX() + coY * r.dir.getY() + coZ * r.dir.getZ();
float c = coX * coX + coY * coY + coZ * coZ - spheres.sqRadius[i];
float discr = nb * nb - c;
if (discr > 0)
{
float discrSq = sqrtf(discr);
// Try earlier t
float t = nb - discrSq;
if (t <= tMin) // before min, try later t!
t = nb + discrSq;
if (t > tMin && t < hitT)
{
id = i;
hitT = t;
}
}
}
if (id != -1)
{
outHit.pos = r.pointAt(hitT);
outHit.normal = (outHit.pos - float3(spheres.centerX[id], spheres.centerY[id], spheres.centerZ[id])) * spheres.invRadius[id];
outHit.t = hitT;
return id;
}
else
return -1;
#endif // #else of #if DO_HIT_SPHERES_SIMD
}


@@ -0,0 +1,436 @@
#pragma once
#include <math.h>
#include <assert.h>
#include <stdint.h>
#include "Config.h"
#include "MathSimd.h"
#define kPI 3.1415926f
// SSE/SIMD vector largely based on http://www.codersnotes.com/notes/maths-lib-2016/
#if DO_FLOAT3_WITH_SIMD
#if !defined(__arm__) && !defined(__arm64__) && !defined(__aarch64__)
// ---- SSE implementation
// SHUFFLE3(v, 0,1,2) leaves the vector unchanged (v.xyz).
// SHUFFLE3(v, 0,0,0) splats the X (v.xxx).
#define SHUFFLE3(V, X,Y,Z) float3(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(Z,Z,Y,X)))
struct float3
{
VM_INLINE float3() {}
VM_INLINE explicit float3(const float *p) { m = _mm_set_ps(p[2], p[2], p[1], p[0]); }
VM_INLINE explicit float3(float x, float y, float z) { m = _mm_set_ps(z, z, y, x); }
VM_INLINE explicit float3(float v) { m = _mm_set1_ps(v); }
VM_INLINE explicit float3(__m128 v) { m = v; }
VM_INLINE float getX() const { return _mm_cvtss_f32(m); }
VM_INLINE float getY() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1))); }
VM_INLINE float getZ() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2))); }
VM_INLINE float3 yzx() const { return SHUFFLE3(*this, 1, 2, 0); }
VM_INLINE float3 zxy() const { return SHUFFLE3(*this, 2, 0, 1); }
VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }
void setX(float x)
{
m = _mm_move_ss(m, _mm_set_ss(x));
}
void setY(float y)
{
__m128 t = _mm_move_ss(m, _mm_set_ss(y));
t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 2, 0, 0));
m = _mm_move_ss(t, m);
}
void setZ(float z)
{
__m128 t = _mm_move_ss(m, _mm_set_ss(z));
t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 0, 1, 0));
m = _mm_move_ss(t, m);
}
__m128 m;
};
typedef float3 bool3;
VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = _mm_add_ps(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a, float3 b) { a.m = _mm_sub_ps(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float3 b) { a.m = _mm_mul_ps(a.m, b.m); return a; }
VM_INLINE float3 operator/ (float3 a, float3 b) { a.m = _mm_div_ps(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float b) { a.m = _mm_mul_ps(a.m, _mm_set1_ps(b)); return a; }
VM_INLINE float3 operator/ (float3 a, float b) { a.m = _mm_div_ps(a.m, _mm_set1_ps(b)); return a; }
VM_INLINE float3 operator* (float a, float3 b) { b.m = _mm_mul_ps(_mm_set1_ps(a), b.m); return b; }
VM_INLINE float3 operator/ (float a, float3 b) { b.m = _mm_div_ps(_mm_set1_ps(a), b.m); return b; }
VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }
VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }
VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }
VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }
VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = _mm_cmpeq_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = _mm_cmpneq_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = _mm_cmplt_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = _mm_cmpgt_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = _mm_cmple_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = _mm_cmpge_ps(a.m, b.m); return a; }
VM_INLINE float3 min(float3 a, float3 b) { a.m = _mm_min_ps(a.m, b.m); return a; }
VM_INLINE float3 max(float3 a, float3 b) { a.m = _mm_max_ps(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a) { return float3(_mm_setzero_ps()) - a; }
VM_INLINE float hmin(float3 v)
{
v = min(v, SHUFFLE3(v, 1, 0, 2));
return min(v, SHUFFLE3(v, 2, 0, 1)).getX();
}
VM_INLINE float hmax(float3 v)
{
v = max(v, SHUFFLE3(v, 1, 0, 2));
return max(v, SHUFFLE3(v, 2, 0, 1)).getX();
}
VM_INLINE float3 cross(float3 a, float3 b)
{
// x <- a.y*b.z - a.z*b.y
// y <- a.z*b.x - a.x*b.z
// z <- a.x*b.y - a.y*b.x
// We can save a shuffle by grouping it in this wacky order:
return (a.zxy()*b - a*b.zxy()).zxy();
}
// Returns a 3-bit code where bit0..bit2 is X..Z
VM_INLINE unsigned mask(float3 v) { return _mm_movemask_ps(v.m) & 7; }
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool3 v) { return mask(v) != 0; }
VM_INLINE bool all(bool3 v) { return mask(v) == 7; }
VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); }
VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); }
VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); }
#else // #if !defined(__arm__) && !defined(__arm64__) && !defined(__aarch64__)
// ---- NEON implementation
#include <arm_neon.h>
struct float3
{
VM_INLINE float3() {}
VM_INLINE explicit float3(const float *p) { float v[4] = {p[0], p[1], p[2], 0}; m = vld1q_f32(v); }
VM_INLINE explicit float3(float x, float y, float z) { float v[4] = {x, y, z, 0}; m = vld1q_f32(v); }
VM_INLINE explicit float3(float v) { m = vdupq_n_f32(v); }
VM_INLINE explicit float3(float32x4_t v) { m = v; }
VM_INLINE float getX() const { return vgetq_lane_f32(m, 0); }
VM_INLINE float getY() const { return vgetq_lane_f32(m, 1); }
VM_INLINE float getZ() const { return vgetq_lane_f32(m, 2); }
VM_INLINE float3 yzx() const
{
float32x2_t low = vget_low_f32(m);
float32x4_t yzx = vcombine_f32(vext_f32(low, vget_high_f32(m), 1), low);
return float3(yzx);
}
VM_INLINE float3 zxy() const
{
float32x4_t p = m;
p = vuzpq_f32(vreinterpretq_f32_s32(vextq_s32(vreinterpretq_s32_f32(p), vreinterpretq_s32_f32(p), 1)), p).val[1];
return float3(p);
}
VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }
void setX(float x)
{
m = vsetq_lane_f32(x, m, 0);
}
void setY(float y)
{
m = vsetq_lane_f32(y, m, 1);
}
void setZ(float z)
{
m = vsetq_lane_f32(z, m, 2);
}
float32x4_t m;
};
typedef float3 bool3;
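// reciprocal via estimate + two Newton-Raphson steps (ARMv7 NEON has no float
// division instruction, so a/b below is computed as a * rcp_2(b))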
VM_INLINE float32x4_t rcp_2(float32x4_t v)
{
float32x4_t e = vrecpeq_f32(v);
e = vmulq_f32(vrecpsq_f32(e, v), e);
e = vmulq_f32(vrecpsq_f32(e, v), e);
return e;
}
VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = vaddq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a, float3 b) { a.m = vsubq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float3 b) { a.m = vmulq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator/ (float3 a, float3 b) { float32x4_t recip = rcp_2(b.m); a.m = vmulq_f32(a.m, recip); return a; }
VM_INLINE float3 operator* (float3 a, float b) { a.m = vmulq_f32(a.m, vdupq_n_f32(b)); return a; }
VM_INLINE float3 operator/ (float3 a, float b) { float32x4_t recip = rcp_2(vdupq_n_f32(b)); a.m = vmulq_f32(a.m, recip); return a; }
VM_INLINE float3 operator* (float a, float3 b) { b.m = vmulq_f32(vdupq_n_f32(a), b.m); return b; }
VM_INLINE float3 operator/ (float a, float3 b) { float32x4_t recip = rcp_2(b.m); b.m = vmulq_f32(vdupq_n_f32(a), recip); return b; }
VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }
VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }
VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }
VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }
VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = vceqq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = vmvnq_u32(vceqq_f32(a.m, b.m)); return a; }
VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = vcltq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = vcgtq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = vcleq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = vcgeq_f32(a.m, b.m); return a; }
VM_INLINE float3 min(float3 a, float3 b) { a.m = vminq_f32(a.m, b.m); return a; }
VM_INLINE float3 max(float3 a, float3 b) { a.m = vmaxq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a) { a.m = vnegq_f32(a.m); return a; }
VM_INLINE float hmin(float3 v)
{
float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m));
float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs);
return vget_lane_f32(minOfMinOfHalfs, 0);
}
VM_INLINE float hmax(float3 v)
{
float32x2_t maxOfHalfs = vpmax_f32(vget_low_f32(v.m), vget_high_f32(v.m));
float32x2_t maxOfMaxOfHalfs = vpmax_f32(maxOfHalfs, maxOfHalfs);
return vget_lane_f32(maxOfMaxOfHalfs, 0);
}
VM_INLINE float3 cross(float3 a, float3 b)
{
// x <- a.y*b.z - a.z*b.y
// y <- a.z*b.x - a.x*b.z
// z <- a.x*b.y - a.y*b.x
// We can save a shuffle by grouping it in this wacky order:
return (a.zxy()*b - a*b.zxy()).zxy();
}
// Returns a 3-bit code where bit0..bit2 is X..Z
VM_INLINE unsigned mask(float3 v)
{
static const uint32x4_t movemask = { 1, 2, 4, 8 };
static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
uint32x4_t t0 = vreinterpretq_u32_f32(v.m);
uint32x4_t t1 = vtstq_u32(t0, highbit);
uint32x4_t t2 = vandq_u32(t1, movemask);
uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
}
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool3 v) { return mask(v) != 0; }
VM_INLINE bool all(bool3 v) { return mask(v) == 7; }
VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); }
VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); }
VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); }
#endif // #else of #if !defined(__arm__) && !defined(__arm64__) && !defined(__aarch64__)
#else // #if DO_FLOAT3_WITH_SIMD
// ---- Simple scalar C implementation
struct float3
{
float3() : x(0), y(0), z(0) {}
float3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
float3 operator-() const { return float3(-x, -y, -z); }
float3& operator+=(const float3& o) { x+=o.x; y+=o.y; z+=o.z; return *this; }
float3& operator-=(const float3& o) { x-=o.x; y-=o.y; z-=o.z; return *this; }
float3& operator*=(const float3& o) { x*=o.x; y*=o.y; z*=o.z; return *this; }
float3& operator*=(float o) { x*=o; y*=o; z*=o; return *this; }
VM_INLINE float getX() const { return x; }
VM_INLINE float getY() const { return y; }
VM_INLINE float getZ() const { return z; }
VM_INLINE void setX(float x_) { x = x_; }
VM_INLINE void setY(float y_) { y = y_; }
VM_INLINE void setZ(float z_) { z = z_; }
VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }
float x, y, z;
};
VM_INLINE float3 operator+(const float3& a, const float3& b) { return float3(a.x+b.x,a.y+b.y,a.z+b.z); }
VM_INLINE float3 operator-(const float3& a, const float3& b) { return float3(a.x-b.x,a.y-b.y,a.z-b.z); }
VM_INLINE float3 operator*(const float3& a, const float3& b) { return float3(a.x*b.x,a.y*b.y,a.z*b.z); }
VM_INLINE float3 operator*(const float3& a, float b) { return float3(a.x*b,a.y*b,a.z*b); }
VM_INLINE float3 operator*(float a, const float3& b) { return float3(a*b.x,a*b.y,a*b.z); }
VM_INLINE float dot(const float3& a, const float3& b) { return a.x*b.x+a.y*b.y+a.z*b.z; }
VM_INLINE float3 cross(const float3& a, const float3& b)
{
return float3(
a.y*b.z - a.z*b.y,
-(a.x*b.z - a.z*b.x),
a.x*b.y - a.y*b.x
);
}
#endif // #else of #if DO_FLOAT3_WITH_SIMD
VM_INLINE float length(float3 v) { return sqrtf(dot(v, v)); }
VM_INLINE float sqLength(float3 v) { return dot(v, v); }
VM_INLINE float3 normalize(float3 v) { return v * (1.0f / length(v)); }
VM_INLINE float3 lerp(float3 a, float3 b, float t) { return a + (b-a)*t; }
inline void AssertUnit(float3 v)
{
assert(fabsf(sqLength(v) - 1.0f) < 0.01f);
}
inline float3 reflect(float3 v, float3 n)
{
return v - 2*dot(v,n)*n;
}
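// Snell's law refraction; nint is the ratio of refraction indices n1/n2, and a
// negative discriminant means total internal reflection (no refracted ray)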
inline bool refract(float3 v, float3 n, float nint, float3& outRefracted)
{
AssertUnit(v);
float dt = dot(v, n);
float discr = 1.0f - nint*nint*(1-dt*dt);
if (discr > 0)
{
outRefracted = nint * (v - n*dt) - n*sqrtf(discr);
return true;
}
return false;
}
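// Schlick's approximation to the Fresnel reflectance:
// R(cos) ~ R0 + (1-R0)*(1-cos)^5 with R0 = ((1-ri)/(1+ri))^2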
inline float schlick(float cosine, float ri)
{
float r0 = (1-ri) / (1+ri);
r0 = r0*r0;
return r0 + (1-r0)*powf(1-cosine, 5);
}
struct Ray
{
Ray() {}
Ray(float3 orig_, float3 dir_) : orig(orig_), dir(dir_) { AssertUnit(dir); }
float3 pointAt(float t) const { return orig + dir * t; }
float3 orig;
float3 dir;
};
struct Hit
{
float3 pos;
float3 normal;
float t;
};
struct Sphere
{
Sphere() : radius(1.0f), invRadius(0.0f) {}
Sphere(float3 center_, float radius_) : center(center_), radius(radius_), invRadius(0.0f) {}
void UpdateDerivedData() { invRadius = 1.0f/radius; }
float3 center;
float radius;
float invRadius;
};
// data for all spheres in a "structure of arrays" layout
struct SpheresSoA
{
SpheresSoA(int c)
{
count = c;
// we'll be processing spheres in kSimdWidth chunks, so make sure to allocate
// enough space
simdCount = (c + (kSimdWidth - 1)) / kSimdWidth * kSimdWidth;
centerX = new float[simdCount];
centerY = new float[simdCount];
centerZ = new float[simdCount];
sqRadius = new float[simdCount];
invRadius = new float[simdCount];
// set all data to "impossible sphere" state
for (int i = count; i < simdCount; ++i)
{
centerX[i] = centerY[i] = centerZ[i] = 10000.0f;
sqRadius[i] = 0.0f;
invRadius[i] = 0.0f;
}
}
~SpheresSoA()
{
delete[] centerX;
delete[] centerY;
delete[] centerZ;
delete[] sqRadius;
delete[] invRadius;
}
float* centerX;
float* centerY;
float* centerZ;
float* sqRadius;
float* invRadius;
int simdCount;
int count;
};
int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit);
float RandomFloat01(uint32_t& state);
float3 RandomInUnitDisk(uint32_t& state);
float3 RandomInUnitSphere(uint32_t& state);
float3 RandomUnitVector(uint32_t& state);
struct Camera
{
Camera() {}
// vfov is top to bottom in degrees
Camera(const float3& lookFrom, const float3& lookAt, const float3& vup, float vfov, float aspect, float aperture, float focusDist)
{
lensRadius = aperture / 2;
float theta = vfov*kPI/180;
float halfHeight = tanf(theta/2);
float halfWidth = aspect * halfHeight;
origin = lookFrom;
w = normalize(lookFrom - lookAt);
u = normalize(cross(vup, w));
v = cross(w, u);
lowerLeftCorner = origin - halfWidth*focusDist*u - halfHeight*focusDist*v - focusDist*w;
horizontal = 2*halfWidth*focusDist*u;
vertical = 2*halfHeight*focusDist*v;
}
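// GetRay jitters the ray origin over a lens disk of radius lensRadius for
// depth of field; only points at focusDist project back to the same pixel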
Ray GetRay(float s, float t, uint32_t& state) const
{
float3 rd = lensRadius * RandomInUnitDisk(state);
float3 offset = u * rd.getX() + v * rd.getY();
return Ray(origin + offset, normalize(lowerLeftCorner + s*horizontal + t*vertical - origin - offset));
}
float3 origin;
float3 lowerLeftCorner;
float3 horizontal;
float3 vertical;
float3 u, v, w;
float lensRadius;
};


@@ -0,0 +1,392 @@
#include "Config.h"
#include "Test.h"
#include "Maths.h"
#include <algorithm>
#include <string.h> // memcpy in GetSceneDesc
#if CPU_CAN_DO_THREADS
#include "enkiTS/TaskScheduler_c.h"
#include <thread>
#endif
#include <atomic>
#include "../../../public/tracy/Tracy.hpp"
// 46 spheres (2 emissive) when enabled; 9 spheres (1 emissive) when disabled
#define DO_BIG_SCENE 1
static Sphere s_Spheres[] =
{
{float3(0,-100.5,-1), 100},
{float3(2,0,-1), 0.5f},
{float3(0,0,-1), 0.5f},
{float3(-2,0,-1), 0.5f},
{float3(2,0,1), 0.5f},
{float3(0,0,1), 0.5f},
{float3(-2,0,1), 0.5f},
{float3(0.5f,1,0.5f), 0.5f},
{float3(-1.5f,1.5f,0.f), 0.3f},
#if DO_BIG_SCENE
{float3(4,0,-3), 0.5f}, {float3(3,0,-3), 0.5f}, {float3(2,0,-3), 0.5f}, {float3(1,0,-3), 0.5f}, {float3(0,0,-3), 0.5f}, {float3(-1,0,-3), 0.5f}, {float3(-2,0,-3), 0.5f}, {float3(-3,0,-3), 0.5f}, {float3(-4,0,-3), 0.5f},
{float3(4,0,-4), 0.5f}, {float3(3,0,-4), 0.5f}, {float3(2,0,-4), 0.5f}, {float3(1,0,-4), 0.5f}, {float3(0,0,-4), 0.5f}, {float3(-1,0,-4), 0.5f}, {float3(-2,0,-4), 0.5f}, {float3(-3,0,-4), 0.5f}, {float3(-4,0,-4), 0.5f},
{float3(4,0,-5), 0.5f}, {float3(3,0,-5), 0.5f}, {float3(2,0,-5), 0.5f}, {float3(1,0,-5), 0.5f}, {float3(0,0,-5), 0.5f}, {float3(-1,0,-5), 0.5f}, {float3(-2,0,-5), 0.5f}, {float3(-3,0,-5), 0.5f}, {float3(-4,0,-5), 0.5f},
{float3(4,0,-6), 0.5f}, {float3(3,0,-6), 0.5f}, {float3(2,0,-6), 0.5f}, {float3(1,0,-6), 0.5f}, {float3(0,0,-6), 0.5f}, {float3(-1,0,-6), 0.5f}, {float3(-2,0,-6), 0.5f}, {float3(-3,0,-6), 0.5f}, {float3(-4,0,-6), 0.5f},
{float3(1.5f,1.5f,-2), 0.3f},
#endif // #if DO_BIG_SCENE
};
const int kSphereCount = sizeof(s_Spheres) / sizeof(s_Spheres[0]);
static SpheresSoA s_SpheresSoA(kSphereCount);
struct Material
{
enum Type { Lambert, Metal, Dielectric };
Type type;
float3 albedo;
float3 emissive;
float roughness;
float ri;
};
static Material s_SphereMats[kSphereCount] =
{
{ Material::Lambert, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, },
{ Material::Lambert, float3(0.8f, 0.4f, 0.4f), float3(0,0,0), 0, 0, },
{ Material::Lambert, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0, 0, },
{ Material::Metal, float3(0.4f, 0.4f, 0.8f), float3(0,0,0), 0, 0 },
{ Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0, 0 },
{ Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0.2f, 0 },
{ Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0.6f, 0 },
{ Material::Dielectric, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 1.5f },
{ Material::Lambert, float3(0.8f, 0.6f, 0.2f), float3(30,25,15), 0, 0 },
#if DO_BIG_SCENE
{ Material::Lambert, float3(0.1f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.2f, 0.2f, 0.2f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.3f, 0.3f, 0.3f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.5f, 0.5f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.6f, 0.6f, 0.6f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.7f, 0.7f, 0.7f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.9f, 0.9f, 0.9f), float3(0,0,0), 0, 0, },
{ Material::Metal, float3(0.1f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.2f, 0.2f, 0.2f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.3f, 0.3f, 0.3f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.5f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.6f, 0.6f, 0.6f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.7f, 0.7f, 0.7f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.9f, 0.9f, 0.9f), float3(0,0,0), 0, 0, },
{ Material::Metal, float3(0.8f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.5f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.4f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.1f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.1f, 0.8f), float3(0,0,0), 0, 0, },
{ Material::Lambert, float3(0.8f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.5f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.4f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.1f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.1f, 0.8f), float3(0,0,0), 0, 0, },
{ Material::Lambert, float3(0.1f, 0.2f, 0.5f), float3(3,10,20), 0, 0 },
#endif
};
static int s_EmissiveSpheres[kSphereCount];
static int s_EmissiveSphereCount;
static Camera s_Cam;
const float kMinT = 0.001f;
const float kMaxT = 1.0e7f;
const int kMaxDepth = 10;
bool HitWorld(const Ray& r, float tMin, float tMax, Hit& outHit, int& outID)
{
outID = HitSpheres(r, s_SpheresSoA, tMin, tMax, outHit);
return outID != -1;
}
static bool Scatter(const Material& mat, const Ray& r_in, const Hit& rec, float3& attenuation, Ray& scattered, float3& outLightE, int& inoutRayCount, uint32_t& state)
{
ZoneScoped;
outLightE = float3(0,0,0);
if (mat.type == Material::Lambert)
{
// random point on unit sphere that is tangent to the hit point
float3 target = rec.pos + rec.normal + RandomUnitVector(state);
scattered = Ray(rec.pos, normalize(target - rec.pos));
attenuation = mat.albedo;
// sample lights
#if DO_LIGHT_SAMPLING
for (int j = 0; j < s_EmissiveSphereCount; ++j)
{
int i = s_EmissiveSpheres[j];
const Material& smat = s_SphereMats[i];
if (&mat == &smat)
continue; // skip self
const Sphere& s = s_Spheres[i];
// create a random direction towards sphere
// coord system for sampling: sw, su, sv
float3 sw = normalize(s.center - rec.pos);
float3 su = normalize(cross(fabs(sw.getX())>0.01f ? float3(0,1,0):float3(1,0,0), sw));
float3 sv = cross(sw, su);
// sample sphere by solid angle
float cosAMax = sqrtf(1.0f - s.radius*s.radius / sqLength(rec.pos-s.center));
float eps1 = RandomFloat01(state), eps2 = RandomFloat01(state);
float cosA = 1.0f - eps1 + eps1 * cosAMax;
float sinA = sqrtf(1.0f - cosA*cosA);
float phi = 2 * kPI * eps2;
float3 l = su * (cosf(phi) * sinA) + sv * (sinf(phi) * sinA) + sw * cosA;
//l = normalize(l); // NOTE(fg): This is already normalized, by construction.
// shoot shadow ray
Hit lightHit;
int hitID;
++inoutRayCount;
if (HitWorld(Ray(rec.pos, l), kMinT, kMaxT, lightHit, hitID) && hitID == i)
{
float omega = 2 * kPI * (1-cosAMax);
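// (omega is the solid angle subtended by the sphere from the hit point:
// the spherical cap 2*pi*(1 - cosAMax))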
float3 rdir = r_in.dir;
AssertUnit(rdir);
float3 nl = dot(rec.normal, rdir) < 0 ? rec.normal : -rec.normal;
outLightE += (mat.albedo * smat.emissive) * (std::max(0.0f, dot(l, nl)) * omega / kPI);
}
}
#endif
return true;
}
else if (mat.type == Material::Metal)
{
AssertUnit(r_in.dir); AssertUnit(rec.normal);
float3 refl = reflect(r_in.dir, rec.normal);
// reflected ray, and random inside of sphere based on roughness
float roughness = mat.roughness;
#if DO_MITSUBA_COMPARE
roughness = 0; // until we get better BRDF for metals
#endif
scattered = Ray(rec.pos, normalize(refl + roughness*RandomInUnitSphere(state)));
attenuation = mat.albedo;
return dot(scattered.dir, rec.normal) > 0;
}
else if (mat.type == Material::Dielectric)
{
AssertUnit(r_in.dir); AssertUnit(rec.normal);
float3 outwardN;
float3 rdir = r_in.dir;
float3 refl = reflect(rdir, rec.normal);
float nint;
attenuation = float3(1,1,1);
float3 refr;
float reflProb;
float cosine;
if (dot(rdir, rec.normal) > 0)
{
outwardN = -rec.normal;
nint = mat.ri;
cosine = mat.ri * dot(rdir, rec.normal);
}
else
{
outwardN = rec.normal;
nint = 1.0f / mat.ri;
cosine = -dot(rdir, rec.normal);
}
if (refract(rdir, outwardN, nint, refr))
{
reflProb = schlick(cosine, mat.ri);
}
else
{
reflProb = 1;
}
if (RandomFloat01(state) < reflProb)
scattered = Ray(rec.pos, normalize(refl));
else
scattered = Ray(rec.pos, normalize(refr));
}
else
{
attenuation = float3(1,0,1);
return false;
}
return true;
}
static float3 Trace(const Ray& r, int depth, int& inoutRayCount, uint32_t& state, bool doMaterialE = true)
{
ZoneScoped;
Hit rec;
int id = 0;
++inoutRayCount;
if (HitWorld(r, kMinT, kMaxT, rec, id))
{
Ray scattered;
float3 attenuation;
float3 lightE;
const Material& mat = s_SphereMats[id];
float3 matE = mat.emissive;
if (depth < kMaxDepth && Scatter(mat, r, rec, attenuation, scattered, lightE, inoutRayCount, state))
{
#if DO_LIGHT_SAMPLING
if (!doMaterialE) matE = float3(0,0,0); // don't add material emission if told so
// for Lambert materials, we just did explicit light (emissive) sampling and
// already accounted for their contribution, so if the next ray bounce hits the
// light again, don't add emission
doMaterialE = (mat.type != Material::Lambert);
#endif
return matE + lightE + attenuation * Trace(scattered, depth+1, inoutRayCount, state, doMaterialE);
}
else
{
return matE;
}
}
else
{
// sky
#if DO_MITSUBA_COMPARE
return float3(0.15f,0.21f,0.3f); // easier compare with Mitsuba's constant environment light
#else
float3 unitDir = r.dir;
float t = 0.5f*(unitDir.getY() + 1.0f);
return ((1.0f-t)*float3(1.0f, 1.0f, 1.0f) + t*float3(0.5f, 0.7f, 1.0f)) * 0.3f;
#endif
}
}
#if CPU_CAN_DO_THREADS
static enkiTaskScheduler* g_TS;
#endif
void InitializeTest()
{
ZoneScoped;
#if CPU_CAN_DO_THREADS
g_TS = enkiNewTaskScheduler();
enkiInitTaskSchedulerNumThreads(g_TS, std::max<int>( 2, std::thread::hardware_concurrency() - 2));
#endif
}
void ShutdownTest()
{
ZoneScoped;
#if CPU_CAN_DO_THREADS
enkiDeleteTaskScheduler(g_TS);
#endif
}
struct JobData
{
float time;
int frameCount;
int screenWidth, screenHeight;
float* backbuffer;
Camera* cam;
std::atomic<int> rayCount;
unsigned testFlags;
};
static void TraceRowJob(uint32_t start, uint32_t end, uint32_t threadnum, void* data_)
{
ZoneScoped;
JobData& data = *(JobData*)data_;
float* backbuffer = data.backbuffer + start * data.screenWidth * 4;
float invWidth = 1.0f / data.screenWidth;
float invHeight = 1.0f / data.screenHeight;
float lerpFac = float(data.frameCount) / float(data.frameCount+1);
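// N/(N+1) blending keeps a running average over frames:
// prev*N/(N+1) + col/(N+1) equals the mean of N+1 frames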
if (data.testFlags & kFlagAnimate)
lerpFac *= DO_ANIMATE_SMOOTHING;
if (!(data.testFlags & kFlagProgressive))
lerpFac = 0;
int rayCount = 0;
for (uint32_t y = start; y < end; ++y)
{
uint32_t state = (y * 9781 + data.frameCount * 6271) | 1;
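// ("| 1" keeps the per-row seed non-zero; xorshift is stuck at zero forever otherwise)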
for (int x = 0; x < data.screenWidth; ++x)
{
float3 col(0, 0, 0);
for (int s = 0; s < DO_SAMPLES_PER_PIXEL; s++)
{
float u = float(x + RandomFloat01(state)) * invWidth;
float v = float(y + RandomFloat01(state)) * invHeight;
Ray r = data.cam->GetRay(u, v, state);
col += Trace(r, 0, rayCount, state);
}
col *= 1.0f / float(DO_SAMPLES_PER_PIXEL);
float3 prev(backbuffer[0], backbuffer[1], backbuffer[2]);
col = prev * lerpFac + col * (1-lerpFac);
col.store(backbuffer);
backbuffer += 4;
}
}
data.rayCount += rayCount;
}
void UpdateTest(float time, int frameCount, int screenWidth, int screenHeight, unsigned testFlags)
{
ZoneScoped;
if (testFlags & kFlagAnimate)
{
s_Spheres[1].center.setY(cosf(time) + 1.0f);
s_Spheres[8].center.setZ(sinf(time)*0.3f);
}
float3 lookfrom(0, 2, 3);
float3 lookat(0, 0, 0);
float distToFocus = 3;
#if DO_MITSUBA_COMPARE
float aperture = 0.0f;
#else
float aperture = 0.1f;
#endif
#if DO_BIG_SCENE
aperture *= 0.2f;
#endif
s_EmissiveSphereCount = 0;
for (int i = 0; i < kSphereCount; ++i)
{
Sphere& s = s_Spheres[i];
s.UpdateDerivedData();
s_SpheresSoA.centerX[i] = s.center.getX();
s_SpheresSoA.centerY[i] = s.center.getY();
s_SpheresSoA.centerZ[i] = s.center.getZ();
s_SpheresSoA.sqRadius[i] = s.radius * s.radius;
s_SpheresSoA.invRadius[i] = s.invRadius;
// Remember IDs of emissive spheres (light sources)
const Material& smat = s_SphereMats[i];
if (smat.emissive.getX() > 0 || smat.emissive.getY() > 0 || smat.emissive.getZ() > 0)
{
s_EmissiveSpheres[s_EmissiveSphereCount] = i;
s_EmissiveSphereCount++;
}
}
s_Cam = Camera(lookfrom, lookat, float3(0, 1, 0), 60, float(screenWidth) / float(screenHeight), aperture, distToFocus);
}
void DrawTest(float time, int frameCount, int screenWidth, int screenHeight, float* backbuffer, int& outRayCount, unsigned testFlags)
{
ZoneScoped;
JobData args;
args.time = time;
args.frameCount = frameCount;
args.screenWidth = screenWidth;
args.screenHeight = screenHeight;
args.backbuffer = backbuffer;
args.cam = &s_Cam;
args.testFlags = testFlags;
args.rayCount = 0;
#if CPU_CAN_DO_THREADS
enkiTaskSet* task = enkiCreateTaskSet(g_TS, TraceRowJob);
bool threaded = true;
enkiAddTaskSetToPipeMinRange(g_TS, task, &args, screenHeight, threaded ? 4 : screenHeight);
enkiWaitForTaskSet(g_TS, task);
enkiDeleteTaskSet(task);
#else
TraceRowJob(0, screenHeight, 0, &args);
#endif
outRayCount = args.rayCount;
}
void GetObjectCount(int& outCount, int& outObjectSize, int& outMaterialSize, int& outCamSize)
{
ZoneScoped;
outCount = kSphereCount;
outObjectSize = sizeof(Sphere);
outMaterialSize = sizeof(Material);
outCamSize = sizeof(Camera);
}
void GetSceneDesc(void* outObjects, void* outMaterials, void* outCam, void* outEmissives, int* outEmissiveCount)
{
ZoneScoped;
memcpy(outObjects, s_Spheres, kSphereCount * sizeof(s_Spheres[0]));
memcpy(outMaterials, s_SphereMats, kSphereCount * sizeof(s_SphereMats[0]));
memcpy(outCam, &s_Cam, sizeof(s_Cam));
memcpy(outEmissives, s_EmissiveSpheres, s_EmissiveSphereCount * sizeof(s_EmissiveSpheres[0]));
*outEmissiveCount = s_EmissiveSphereCount;
}


@@ -0,0 +1,17 @@
#pragma once
#include <stdint.h>
enum TestFlags
{
kFlagAnimate = (1 << 0),
kFlagProgressive = (1 << 1),
};
void InitializeTest();
void ShutdownTest();
void UpdateTest(float time, int frameCount, int screenWidth, int screenHeight, unsigned testFlags);
void DrawTest(float time, int frameCount, int screenWidth, int screenHeight, float* backbuffer, int& outRayCount, unsigned testFlags);
void GetObjectCount(int& outCount, int& outObjectSize, int& outMaterialSize, int& outCamSize);
void GetSceneDesc(void* outObjects, void* outMaterials, void* outCam, void* outEmissives, int* outEmissiveCount);


@@ -0,0 +1,79 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#pragma once
#include <stdint.h>
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#undef GetObject
#include <intrin.h>
extern "C" void _ReadWriteBarrier();
#pragma intrinsic(_ReadWriteBarrier)
#pragma intrinsic(_InterlockedCompareExchange)
#pragma intrinsic(_InterlockedExchangeAdd)
// Memory Barriers to prevent CPU and Compiler re-ordering
#define BASE_MEMORYBARRIER_ACQUIRE() _ReadWriteBarrier()
#define BASE_MEMORYBARRIER_RELEASE() _ReadWriteBarrier()
#define BASE_ALIGN(x) __declspec( align( x ) )
#else
#define BASE_MEMORYBARRIER_ACQUIRE() __asm__ __volatile__("": : :"memory")
#define BASE_MEMORYBARRIER_RELEASE() __asm__ __volatile__("": : :"memory")
#define BASE_ALIGN(x) __attribute__ ((aligned( x )))
#endif
namespace enki
{
// Atomically performs: if( *pDest == compareWith ) { *pDest = swapTo; }
// returns old *pDest (so if successful, returns compareWith)
inline uint32_t AtomicCompareAndSwap( volatile uint32_t* pDest, uint32_t swapTo, uint32_t compareWith )
{
#ifdef _WIN32
// assumes two's complement - unsigned / signed conversion leads to same bit pattern
return _InterlockedCompareExchange( (volatile long*)pDest,swapTo, compareWith );
#else
return __sync_val_compare_and_swap( pDest, compareWith, swapTo );
#endif
}
inline uint64_t AtomicCompareAndSwap( volatile uint64_t* pDest, uint64_t swapTo, uint64_t compareWith )
{
#ifdef _WIN32
// assumes two's complement - unsigned / signed conversion leads to same bit pattern
return _InterlockedCompareExchange64( (__int64 volatile*)pDest, swapTo, compareWith );
#else
return __sync_val_compare_and_swap( pDest, compareWith, swapTo );
#endif
}
// Atomically performs: tmp = *pDest; *pDest += value; return tmp;
inline int32_t AtomicAdd( volatile int32_t* pDest, int32_t value )
{
#ifdef _WIN32
return _InterlockedExchangeAdd( (long*)pDest, value );
#else
return __sync_fetch_and_add( pDest, value );
#endif
}
}


@@ -0,0 +1,240 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#pragma once
#include <stdint.h>
#include <assert.h>
#include "Atomics.h"
#include <string.h>
namespace enki
{
// LockLessMultiReadPipe - Single writer, multiple reader thread safe pipe using (semi) lockless programming
// Readers can only read from the back of the pipe
// The single writer can write to the front of the pipe, and read from both ends (a writer can be a reader)
// for many of the principles used here, see http://msdn.microsoft.com/en-us/library/windows/desktop/ee418650(v=vs.85).aspx
// Note: using log2 sizes so we do not need to clamp (multi-operation)
// T is the contained type
// Note: this is not truly lockless, as it uses the flags as a form of lock state.
template<uint8_t cSizeLog2, typename T> class LockLessMultiReadPipe
{
public:
LockLessMultiReadPipe();
~LockLessMultiReadPipe() {}
// ReaderTryReadBack returns false if we were unable to read
// This is thread safe for both multiple readers and the writer
bool ReaderTryReadBack( T* pOut );
// WriterTryReadFront returns false if we were unable to read
// This is thread safe for the single writer, but should not be called by readers
bool WriterTryReadFront( T* pOut );
// WriterTryWriteFront returns false if we were unable to write
// This is thread safe for the single writer, but should not be called by readers
bool WriterTryWriteFront( const T& in );
// IsPipeEmpty() is a utility function, not intended for general use
// Should only be used very prudently.
bool IsPipeEmpty() const
{
return 0 == m_WriteIndex - m_ReadCount;
}
void Clear()
{
m_WriteIndex = 0;
m_ReadIndex = 0;
m_ReadCount = 0;
memset( (void*)m_Flags, 0, sizeof( m_Flags ) );
}
private:
const static uint32_t ms_cSize = ( 1 << cSizeLog2 );
const static uint32_t ms_cIndexMask = ms_cSize - 1;
const static uint32_t FLAG_INVALID = 0xFFFFFFFF; // 32bit for CAS
const static uint32_t FLAG_CAN_WRITE = 0x00000000; // 32bit for CAS
const static uint32_t FLAG_CAN_READ = 0x11111111; // 32bit for CAS
T m_Buffer[ ms_cSize ];
// read and write indexes allow fast access to the pipe, but actual access
// is controlled by the access flags.
volatile uint32_t BASE_ALIGN(4) m_WriteIndex;
volatile uint32_t BASE_ALIGN(4) m_ReadCount;
volatile uint32_t m_Flags[ ms_cSize ];
volatile uint32_t BASE_ALIGN(4) m_ReadIndex;
};
template<uint8_t cSizeLog2, typename T> inline
LockLessMultiReadPipe<cSizeLog2,T>::LockLessMultiReadPipe()
: m_WriteIndex(0)
, m_ReadIndex(0)
, m_ReadCount(0)
{
assert( cSizeLog2 < 32 );
memset( (void*)m_Flags, 0, sizeof( m_Flags ) );
}
template<uint8_t cSizeLog2, typename T> inline
bool LockLessMultiReadPipe<cSizeLog2,T>::ReaderTryReadBack( T* pOut )
{
uint32_t actualReadIndex;
uint32_t readCount = m_ReadCount;
// We get hold of read index for consistency,
// and do first pass starting at read count
uint32_t readIndexToUse = readCount;
while(true)
{
uint32_t writeIndex = m_WriteIndex;
// power of two sizes ensures we can use a simple calc without modulus
uint32_t numInPipe = writeIndex - readCount;
if( 0 == numInPipe )
{
return false;
}
if( readIndexToUse >= writeIndex )
{
// move back to start
readIndexToUse = m_ReadIndex;
}
// power of two sizes ensures we can perform AND for a modulus
actualReadIndex = readIndexToUse & ms_cIndexMask;
// Multiple potential readers mean we should check if the data is valid,
// using an atomic compare exchange
uint32_t previous = AtomicCompareAndSwap( &m_Flags[ actualReadIndex ], FLAG_INVALID, FLAG_CAN_READ );
if( FLAG_CAN_READ == previous )
{
break;
}
++readIndexToUse;
// update known read count
readCount = m_ReadCount;
}
// we update the read index using an atomic add, as we've only read one piece of data.
// this ensures consistency of the read index, and the above loop ensures readers
// only read from unread data
AtomicAdd( (volatile int32_t*)&m_ReadCount, 1 );
BASE_MEMORYBARRIER_ACQUIRE();
// now read data, ensuring we do so after above reads & CAS
*pOut = m_Buffer[ actualReadIndex ];
m_Flags[ actualReadIndex ] = FLAG_CAN_WRITE;
return true;
}
template<uint8_t cSizeLog2, typename T> inline
bool LockLessMultiReadPipe<cSizeLog2,T>::WriterTryReadFront( T* pOut )
{
uint32_t writeIndex = m_WriteIndex;
uint32_t frontReadIndex = writeIndex;
// Multiple potential readers mean we should check if the data is valid,
// using an atomic compare exchange - which acts as a form of lock (so not quite lockless really).
uint32_t previous = FLAG_INVALID;
uint32_t actualReadIndex = 0;
while( true )
{
// power of two sizes ensures we can use a simple calc without modulus
uint32_t readCount = m_ReadCount;
uint32_t numInPipe = writeIndex - readCount;
if( 0 == numInPipe || 0 == frontReadIndex )
{
// frontReadIndex can get to 0 here if that item was just being read by another thread.
m_ReadIndex = readCount;
return false;
}
--frontReadIndex;
actualReadIndex = frontReadIndex & ms_cIndexMask;
previous = AtomicCompareAndSwap( &m_Flags[ actualReadIndex ], FLAG_INVALID, FLAG_CAN_READ );
if( FLAG_CAN_READ == previous )
{
break;
}
else if( m_ReadIndex >= frontReadIndex )
{
return false;
}
}
// now read data, ensuring we do so after above reads & CAS
*pOut = m_Buffer[ actualReadIndex ];
m_Flags[ actualReadIndex ] = FLAG_CAN_WRITE;
BASE_MEMORYBARRIER_RELEASE();
// 32-bit aligned stores are atomic, and writer owns the write index
// we only move one back as this is as many as we have read, not where we have read from.
--m_WriteIndex;
return true;
}
template<uint8_t cSizeLog2, typename T> inline
bool LockLessMultiReadPipe<cSizeLog2,T>::WriterTryWriteFront( const T& in )
{
// The writer 'owns' the write index, and readers can only reduce
// the amount of data in the pipe.
// We get hold of both values for consistency and to reduce false sharing
// impacting more than one access
uint32_t writeIndex = m_WriteIndex;
// power of two sizes ensures we can perform AND for a modulus
uint32_t actualWriteIndex = writeIndex & ms_cIndexMask;
// a reader may still be reading this item, as there are multiple readers
if( m_Flags[ actualWriteIndex ] != FLAG_CAN_WRITE )
{
return false; // still being read, so have caught up with tail.
}
// as we are the only writer we can update the data without atomics
// whilst the write index has not been updated
m_Buffer[ actualWriteIndex ] = in;
m_Flags[ actualWriteIndex ] = FLAG_CAN_READ;
// We need to ensure the above writes occur prior to updating the write index,
// otherwise another thread might read before it's finished
BASE_MEMORYBARRIER_RELEASE();
// 32-bit aligned stores are atomic, and the writer controls the write index
++writeIndex;
m_WriteIndex = writeIndex;
return true;
}
}


@@ -0,0 +1,437 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#include <assert.h>
#include "TaskScheduler.h"
#include "LockLessMultiReadPipe.h"
using namespace enki;
static const uint32_t PIPESIZE_LOG2 = 8;
static const uint32_t SPIN_COUNT = 100;
static const uint32_t SPIN_BACKOFF_MULTIPLIER = 10;
static const uint32_t MAX_NUM_INITIAL_PARTITIONS = 8;
// each software thread gets its own copy of gtl_threadNum, so this is safe to use as a static variable
static THREAD_LOCAL uint32_t gtl_threadNum = 0;
namespace enki
{
struct SubTaskSet
{
ITaskSet* pTask;
TaskSetPartition partition;
};
// we derive class TaskPipe rather than typedef to get forward declaration working easily
class TaskPipe : public LockLessMultiReadPipe<PIPESIZE_LOG2,enki::SubTaskSet> {};
struct ThreadArgs
{
uint32_t threadNum;
TaskScheduler* pTaskScheduler;
};
}
namespace
{
SubTaskSet SplitTask( SubTaskSet& subTask_, uint32_t rangeToSplit_ )
{
SubTaskSet splitTask = subTask_;
uint32_t rangeLeft = subTask_.partition.end - subTask_.partition.start;
if( rangeToSplit_ > rangeLeft )
{
rangeToSplit_ = rangeLeft;
}
splitTask.partition.end = subTask_.partition.start + rangeToSplit_;
subTask_.partition.start = splitTask.partition.end;
return splitTask;
}
#if defined _WIN32
#if defined _M_IX86 || defined _M_X64
#pragma intrinsic(_mm_pause)
inline void Pause() { _mm_pause(); }
#endif
#elif defined __i386__ || defined __x86_64__
inline void Pause() { __asm__ __volatile__("pause;"); }
#else
inline void Pause() { } // other architectures may have a NOP or yield equivalent
#endif
}
static void SafeCallback(ProfilerCallbackFunc func_, uint32_t threadnum_)
{
if( func_ )
{
func_(threadnum_);
}
}
ProfilerCallbacks* TaskScheduler::GetProfilerCallbacks()
{
return &m_ProfilerCallbacks;
}
THREADFUNC_DECL TaskScheduler::TaskingThreadFunction( void* pArgs )
{
ThreadArgs args = *(ThreadArgs*)pArgs;
uint32_t threadNum = args.threadNum;
TaskScheduler* pTS = args.pTaskScheduler;
gtl_threadNum = threadNum;
SafeCallback( pTS->m_ProfilerCallbacks.threadStart, threadNum );
uint32_t spinCount = 0;
uint32_t hintPipeToCheck_io = threadNum + 1; // does not need to be clamped.
while( pTS->m_bRunning )
{
if(!pTS->TryRunTask( threadNum, hintPipeToCheck_io ) )
{
// no tasks, will spin then wait
++spinCount;
if( spinCount > SPIN_COUNT )
{
pTS->WaitForTasks( threadNum );
spinCount = 0;
}
else
{
uint32_t spinBackoffCount = spinCount * SPIN_BACKOFF_MULTIPLIER;
while( spinBackoffCount )
{
Pause();
--spinBackoffCount;
}
}
}
else
{
spinCount = 0;
}
}
AtomicAdd( &pTS->m_NumThreadsRunning, -1 );
SafeCallback( pTS->m_ProfilerCallbacks.threadStop, threadNum );
return 0;
}
void TaskScheduler::StartThreads()
{
if( m_bHaveThreads )
{
return;
}
m_bRunning = true;
SemaphoreCreate( m_NewTaskSemaphore );
// we create one less thread than m_NumThreads as the main thread counts as one
m_pThreadNumStore = new ThreadArgs[m_NumThreads];
m_pThreadIDs = new threadid_t[m_NumThreads];
m_pThreadNumStore[0].threadNum = 0;
m_pThreadNumStore[0].pTaskScheduler = this;
m_pThreadIDs[0] = 0;
m_NumThreadsWaiting = 0;
m_NumThreadsRunning = 1; // account for the main thread
for( uint32_t thread = 1; thread < m_NumThreads; ++thread )
{
m_pThreadNumStore[thread].threadNum = thread;
m_pThreadNumStore[thread].pTaskScheduler = this;
ThreadCreate( &m_pThreadIDs[thread], TaskingThreadFunction, &m_pThreadNumStore[thread] );
++m_NumThreadsRunning;
}
// ensure we have sufficient partitions to equally fill either all threads including main
// or just the threads we've launched; this is outside the first-init check as we want to be
// able to change it at runtime
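// e.g. with m_NumThreads == 4 this yields m_NumPartitions == 12 and
// m_NumInitialPartitions == 3, so initial partitions can be re-split for stealing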
if( 1 == m_NumThreads )
{
m_NumPartitions = 1;
m_NumInitialPartitions = 1;
}
else
{
m_NumPartitions = m_NumThreads * (m_NumThreads - 1);
m_NumInitialPartitions = m_NumThreads - 1;
if( m_NumInitialPartitions > MAX_NUM_INITIAL_PARTITIONS )
{
m_NumInitialPartitions = MAX_NUM_INITIAL_PARTITIONS;
}
}
m_bHaveThreads = true;
}
void TaskScheduler::StopThreads( bool bWait_ )
{
if( m_bHaveThreads )
{
// wait for the threads to quit before deleting data
m_bRunning = false;
while( bWait_ && m_NumThreadsRunning > 1 )
{
// keep firing event to ensure all threads pick up state of m_bRunning
SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsRunning );
}
for( uint32_t thread = 1; thread < m_NumThreads; ++thread )
{
ThreadTerminate( m_pThreadIDs[thread] );
}
m_NumThreads = 0;
delete[] m_pThreadNumStore;
delete[] m_pThreadIDs;
m_pThreadNumStore = 0;
m_pThreadIDs = 0;
SemaphoreClose( m_NewTaskSemaphore );
m_bHaveThreads = false;
m_NumThreadsWaiting = 0;
m_NumThreadsRunning = 0;
}
}
bool TaskScheduler::TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ )
{
// check for tasks
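// the owning thread reads from the front of its own pipe; stealing (below)
// takes from the back of another thread's pipe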
SubTaskSet subTask;
bool bHaveTask = m_pPipesPerThread[ threadNum ].WriterTryReadFront( &subTask );
uint32_t threadToCheck = hintPipeToCheck_io_;
uint32_t checkCount = 0;
while( !bHaveTask && checkCount < m_NumThreads )
{
threadToCheck = ( hintPipeToCheck_io_ + checkCount ) % m_NumThreads;
if( threadToCheck != threadNum )
{
bHaveTask = m_pPipesPerThread[ threadToCheck ].ReaderTryReadBack( &subTask );
}
++checkCount;
}
if( bHaveTask )
{
// update hint, will preserve value unless actually got task from another thread.
hintPipeToCheck_io_ = threadToCheck;
uint32_t partitionSize = subTask.partition.end - subTask.partition.start;
if( subTask.pTask->m_RangeToRun < partitionSize )
{
SubTaskSet taskToRun = SplitTask( subTask, subTask.pTask->m_RangeToRun );
SplitAndAddTask( gtl_threadNum, subTask, subTask.pTask->m_RangeToRun, 0 );
taskToRun.pTask->ExecuteRange( taskToRun.partition, threadNum );
AtomicAdd( &taskToRun.pTask->m_RunningCount, -1 );
}
else
{
// the task has already been divided up by AddTaskSetToPipe, so just run it
subTask.pTask->ExecuteRange( subTask.partition, threadNum );
AtomicAdd( &subTask.pTask->m_RunningCount, -1 );
}
}
return bHaveTask;
}
void TaskScheduler::WaitForTasks( uint32_t threadNum )
{
// We increment the number of threads waiting here in order
// to ensure that the check for tasks occurs after the increment
// to prevent a task being added after a check, then the thread waiting.
// This will occasionally result in threads being mistakenly awoken,
// but they will then go back to sleep.
AtomicAdd( &m_NumThreadsWaiting, 1 );
bool bHaveTasks = false;
for( uint32_t thread = 0; thread < m_NumThreads; ++thread )
{
if( !m_pPipesPerThread[ thread ].IsPipeEmpty() )
{
bHaveTasks = true;
break;
}
}
if( !bHaveTasks )
{
SafeCallback( m_ProfilerCallbacks.waitStart, threadNum );
SemaphoreWait( m_NewTaskSemaphore );
SafeCallback( m_ProfilerCallbacks.waitStop, threadNum );
}
int32_t prev = AtomicAdd( &m_NumThreadsWaiting, -1 );
assert( prev != 0 );
}
void TaskScheduler::WakeThreads()
{
SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsWaiting );
}
void TaskScheduler::SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_,
uint32_t rangeToSplit_, int32_t runningCountOffset_ )
{
int32_t numAdded = 0;
while( subTask_.partition.start != subTask_.partition.end )
{
SubTaskSet taskToAdd = SplitTask( subTask_, rangeToSplit_ );
// add the partition to the pipe
++numAdded;
if( !m_pPipesPerThread[ gtl_threadNum ].WriterTryWriteFront( taskToAdd ) )
{
if( numAdded > 1 )
{
WakeThreads();
}
// alter range to run the appropriate fraction
if( taskToAdd.pTask->m_RangeToRun < rangeToSplit_ )
{
taskToAdd.partition.end = taskToAdd.partition.start + taskToAdd.pTask->m_RangeToRun;
subTask_.partition.start = taskToAdd.partition.end;
}
taskToAdd.pTask->ExecuteRange( taskToAdd.partition, threadNum_ );
--numAdded;
}
}
// increment running count by number added
AtomicAdd( &subTask_.pTask->m_RunningCount, numAdded + runningCountOffset_ );
WakeThreads();
}
void TaskScheduler::AddTaskSetToPipe( ITaskSet* pTaskSet )
{
// set running count to -1 to guarantee it won't be found complete until all subtasks added
pTaskSet->m_RunningCount = -1;
// divide task up and add to pipe
pTaskSet->m_RangeToRun = pTaskSet->m_SetSize / m_NumPartitions;
if( pTaskSet->m_RangeToRun < pTaskSet->m_MinRange ) { pTaskSet->m_RangeToRun = pTaskSet->m_MinRange; }
uint32_t rangeToSplit = pTaskSet->m_SetSize / m_NumInitialPartitions;
if( rangeToSplit < pTaskSet->m_MinRange ) { rangeToSplit = pTaskSet->m_MinRange; }
SubTaskSet subTask;
subTask.pTask = pTaskSet;
subTask.partition.start = 0;
subTask.partition.end = pTaskSet->m_SetSize;
SplitAndAddTask( gtl_threadNum, subTask, rangeToSplit, 1 );
}
void TaskScheduler::WaitforTaskSet( const ITaskSet* pTaskSet )
{
uint32_t hintPipeToCheck_io = gtl_threadNum + 1; // does not need to be clamped.
if( pTaskSet )
{
while( pTaskSet->m_RunningCount )
{
TryRunTask( gtl_threadNum, hintPipeToCheck_io );
// should add a spin then wait for task completion event.
}
}
else
{
TryRunTask( gtl_threadNum, hintPipeToCheck_io );
}
}
void TaskScheduler::WaitforAll()
{
bool bHaveTasks = true;
uint32_t hintPipeToCheck_io = gtl_threadNum + 1; // does not need to be clamped.
int32_t threadsRunning = m_NumThreadsRunning - 1;
while( bHaveTasks || m_NumThreadsWaiting < threadsRunning )
{
TryRunTask( gtl_threadNum, hintPipeToCheck_io );
bHaveTasks = false;
for( uint32_t thread = 0; thread < m_NumThreads; ++thread )
{
if( !m_pPipesPerThread[ thread ].IsPipeEmpty() )
{
bHaveTasks = true;
break;
}
}
}
}
void TaskScheduler::WaitforAllAndShutdown()
{
WaitforAll();
StopThreads(true);
delete[] m_pPipesPerThread;
m_pPipesPerThread = 0;
}
uint32_t TaskScheduler::GetNumTaskThreads() const
{
return m_NumThreads;
}
TaskScheduler::TaskScheduler()
: m_pPipesPerThread(NULL)
, m_NumThreads(0)
, m_pThreadNumStore(NULL)
, m_pThreadIDs(NULL)
, m_bRunning(false)
, m_NumThreadsRunning(0)
, m_NumThreadsWaiting(0)
, m_NumPartitions(0)
, m_bHaveThreads(false)
{
memset(&m_ProfilerCallbacks, 0, sizeof(m_ProfilerCallbacks));
}
TaskScheduler::~TaskScheduler()
{
StopThreads( true ); // Stops threads, waiting for them.
delete[] m_pPipesPerThread;
m_pPipesPerThread = 0;
}
void TaskScheduler::Initialize( uint32_t numThreads_ )
{
assert( numThreads_ );
StopThreads( true ); // Stops threads, waiting for them.
delete[] m_pPipesPerThread;
m_NumThreads = numThreads_;
m_pPipesPerThread = new TaskPipe[ m_NumThreads ];
StartThreads();
}
void TaskScheduler::Initialize()
{
Initialize( GetNumHardwareThreads() );
}
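// Usage sketch (compiled out): subclass ITaskSet, override ExecuteRange, add the
// set to the pipe, and wait on it. ExampleTask and ExampleMain are hypothetical.
#if 0
struct ExampleTask : enki::ITaskSet
{
    ExampleTask( uint32_t size_ ) : ITaskSet( size_ ) {}
    virtual void ExecuteRange( enki::TaskSetPartition range, uint32_t threadnum )
    {
        for( uint32_t i = range.start; i < range.end; ++i ) { /* process item i */ }
        (void)threadnum;
    }
};
int ExampleMain()
{
    enki::TaskScheduler ts;
    ts.Initialize();               // creates GetNumHardwareThreads()-1 worker threads
    ExampleTask task( 1024 );      // set size: 1024 units of work
    ts.AddTaskSetToPipe( &task );
    ts.WaitforTaskSet( &task );    // run tasks on this thread until task completes
    ts.WaitforAllAndShutdown();
    return 0;
}
#endif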

View File

@@ -0,0 +1,177 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#pragma once
#include <stdint.h>
#include "Threads.h"
namespace enki
{
struct TaskSetPartition
{
uint32_t start;
uint32_t end;
};
class TaskScheduler;
class TaskPipe;
struct ThreadArgs;
struct SubTaskSet;
// Subclass ITaskSet to create tasks.
// TaskSets can be re-used, but check completion first with GetIsComplete().
class ITaskSet
{
public:
ITaskSet()
: m_SetSize(1)
, m_MinRange(1)
, m_RunningCount(0)
, m_RangeToRun(1)
{}
ITaskSet( uint32_t setSize_ )
: m_SetSize( setSize_ )
, m_MinRange(1)
, m_RunningCount(0)
, m_RangeToRun(1)
{}
ITaskSet( uint32_t setSize_, uint32_t minRange_ )
: m_SetSize( setSize_ )
, m_MinRange( minRange_ )
, m_RunningCount(0)
, m_RangeToRun(minRange_)
{}
// ExecuteRange should be overridden to process tasks. It will be called with a
// range_ where range.start >= 0, range.start < range.end, and range.end <= m_SetSize.
// The range values should be mapped so that linearly processing them in order is cache
// friendly, i.e. neighbouring values should be close together.
// threadnum should not be used for changing processing of data; its intended purpose
// is to allow per-thread data buckets for output.
virtual void ExecuteRange( TaskSetPartition range, uint32_t threadnum ) = 0;
// Size of set - usually the number of data items to be processed, see ExecuteRange. Defaults to 1
uint32_t m_SetSize;
// Minimum size of TaskSetPartition range when splitting a task set into partitions.
// This should be set to a value which results in computation effort of at least 10k
// clock cycles to minimize task scheduler overhead.
// NOTE: The last partition will be smaller than m_MinRange if m_SetSize is not a multiple
// of m_MinRange.
// Also known as grain size in literature.
uint32_t m_MinRange;
bool GetIsComplete()
{
return 0 == m_RunningCount;
}
private:
friend class TaskScheduler;
volatile int32_t m_RunningCount;
uint32_t m_RangeToRun;
};
// TaskScheduler implements several callbacks intended for profilers
typedef void (*ProfilerCallbackFunc)( uint32_t threadnum_ );
struct ProfilerCallbacks
{
ProfilerCallbackFunc threadStart;
ProfilerCallbackFunc threadStop;
ProfilerCallbackFunc waitStart;
ProfilerCallbackFunc waitStop;
};
class TaskScheduler
{
public:
TaskScheduler();
~TaskScheduler();
// Call either Initialize() or Initialize( numThreads_ ) before adding tasks.
// Initialize() will create GetNumHardwareThreads()-1 threads, which is
// sufficient to fill the system when including the main thread.
// Initialize can be called multiple times - it will wait for completion
// before re-initializing.
void Initialize();
// Initialize( numThreads_ ) - numThreads_ (must be > 0)
// will create numThreads_-1 threads, as thread 0 is
// the thread on which the initialize was called.
void Initialize( uint32_t numThreads_ );
// Adds the TaskSet to the pipe; returns once all partitions have been added.
// If the pipe fills up, partitions of pTaskSet are run on the calling thread instead.
// Should only be called from the main thread, or from within a task.
void AddTaskSetToPipe( ITaskSet* pTaskSet );
// Runs the TaskSets in pipe until true == pTaskSet->GetIsComplete();
// Should only be called from the thread which created the task scheduler, or from within a task.
// If called with 0 it will try to run a single task, then return.
void WaitforTaskSet( const ITaskSet* pTaskSet );
// Waits for all task sets to complete - not guaranteed to work unless we know we
// are in a situation where tasks aren't being continuously added.
void WaitforAll();
// Waits for all task sets to complete and shuts down threads - not guaranteed to work unless we know we
// are in a situation where tasks aren't being continuously added.
void WaitforAllAndShutdown();
// Returns the number of threads created for running tasks + 1
// to account for the main thread.
uint32_t GetNumTaskThreads() const;
// Returns the ProfilerCallbacks structure so that it can be modified to
// set the callbacks.
ProfilerCallbacks* GetProfilerCallbacks();
private:
static THREADFUNC_DECL TaskingThreadFunction( void* pArgs );
void WaitForTasks( uint32_t threadNum );
bool TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ );
void StartThreads();
void StopThreads( bool bWait_ );
void SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_,
uint32_t rangeToSplit_, int32_t runningCountOffset_ );
void WakeThreads();
TaskPipe* m_pPipesPerThread;
uint32_t m_NumThreads;
ThreadArgs* m_pThreadNumStore;
threadid_t* m_pThreadIDs;
volatile bool m_bRunning;
volatile int32_t m_NumThreadsRunning;
volatile int32_t m_NumThreadsWaiting;
uint32_t m_NumPartitions;
uint32_t m_NumInitialPartitions;
semaphoreid_t m_NewTaskSemaphore;
bool m_bHaveThreads;
ProfilerCallbacks m_ProfilerCallbacks;
TaskScheduler( const TaskScheduler& nocopy );
TaskScheduler& operator=( const TaskScheduler& nocopy );
};
}
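// Illustrative sketch (compiled out): registering the profiler callbacks declared
// above. OnWaitStart/OnWaitStop are hypothetical functions matching the
// ProfilerCallbackFunc signature.
#if 0
#include <stdio.h>
static void OnWaitStart( uint32_t threadnum_ ) { printf( "thread %u waiting\n", threadnum_ ); }
static void OnWaitStop ( uint32_t threadnum_ ) { printf( "thread %u resumed\n", threadnum_ ); }
inline void ExampleRegisterCallbacks( enki::TaskScheduler& ts )
{
    enki::ProfilerCallbacks* pCallbacks = ts.GetProfilerCallbacks();
    pCallbacks->waitStart = OnWaitStart;  // invoked before a thread sleeps on the semaphore
    pCallbacks->waitStop  = OnWaitStop;   // invoked when the thread wakes again
}
#endif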

View File

@@ -0,0 +1,122 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#include "TaskScheduler_c.h"
#include "TaskScheduler.h"
#include <assert.h>
using namespace enki;
struct enkiTaskScheduler : TaskScheduler
{
};
struct enkiTaskSet : ITaskSet
{
enkiTaskSet( enkiTaskExecuteRange taskFun_ ) : taskFun(taskFun_), pArgs(NULL) {}
virtual void ExecuteRange( TaskSetPartition range, uint32_t threadnum )
{
taskFun( range.start, range.end, threadnum, pArgs );
}
enkiTaskExecuteRange taskFun;
void* pArgs;
};
enkiTaskScheduler* enkiNewTaskScheduler()
{
enkiTaskScheduler* pETS = new enkiTaskScheduler();
return pETS;
}
void enkiInitTaskScheduler( enkiTaskScheduler* pETS_ )
{
pETS_->Initialize();
}
void enkiInitTaskSchedulerNumThreads( enkiTaskScheduler* pETS_, uint32_t numThreads_ )
{
pETS_->Initialize( numThreads_ );
}
void enkiDeleteTaskScheduler( enkiTaskScheduler* pETS_ )
{
delete pETS_;
}
enkiTaskSet* enkiCreateTaskSet( enkiTaskScheduler* pETS_, enkiTaskExecuteRange taskFunc_ )
{
return new enkiTaskSet( taskFunc_ );
}
void enkiDeleteTaskSet( enkiTaskSet* pTaskSet_ )
{
delete pTaskSet_;
}
void enkiAddTaskSetToPipe( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_, void* pArgs_, uint32_t setSize_ )
{
assert( pTaskSet_ );
assert( pTaskSet_->taskFun );
pTaskSet_->m_SetSize = setSize_;
pTaskSet_->pArgs = pArgs_;
pETS_->AddTaskSetToPipe( pTaskSet_ );
}
void enkiAddTaskSetToPipeMinRange(enkiTaskScheduler * pETS_, enkiTaskSet * pTaskSet_, void * pArgs_, uint32_t setSize_, uint32_t minRange_)
{
assert( pTaskSet_ );
assert( pTaskSet_->taskFun );
pTaskSet_->m_SetSize = setSize_;
pTaskSet_->m_MinRange = minRange_;
pTaskSet_->pArgs = pArgs_;
pETS_->AddTaskSetToPipe( pTaskSet_ );
}
int enkiIsTaskSetComplete( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ )
{
assert( pTaskSet_ );
return ( pTaskSet_->GetIsComplete() ) ? 1 : 0;
}
void enkiWaitForTaskSet( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ )
{
pETS_->WaitforTaskSet( pTaskSet_ );
}
void enkiWaitForAll( enkiTaskScheduler* pETS_ )
{
pETS_->WaitforAll();
}
uint32_t enkiGetNumTaskThreads( enkiTaskScheduler* pETS_ )
{
return pETS_->GetNumTaskThreads();
}
enkiProfilerCallbacks* enkiGetProfilerCallbacks( enkiTaskScheduler* pETS_ )
{
assert( sizeof(enkiProfilerCallbacks) == sizeof(enki::ProfilerCallbacks) );
return (enkiProfilerCallbacks*)pETS_->GetProfilerCallbacks();
}

View File

@@ -0,0 +1,104 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
typedef struct enkiTaskScheduler enkiTaskScheduler;
typedef struct enkiTaskSet enkiTaskSet;
typedef void (* enkiTaskExecuteRange)( uint32_t start_, uint32_t end_, uint32_t threadnum_, void* pArgs_ );
// Create a new task scheduler
enkiTaskScheduler* enkiNewTaskScheduler();
// Initialize task scheduler - will create GetNumHardwareThreads()-1 threads, which is
// sufficient to fill the system when including the main thread.
// Initialize can be called multiple times - it will wait for completion
// before re-initializing.
void enkiInitTaskScheduler( enkiTaskScheduler* pETS_ );
// Initialize a task scheduler with numThreads_ (must be > 0)
// will create numThreads_-1 threads, as thread 0 is
// the thread on which the initialize was called.
void enkiInitTaskSchedulerNumThreads( enkiTaskScheduler* pETS_, uint32_t numThreads_ );
// Delete a task scheduler
void enkiDeleteTaskScheduler( enkiTaskScheduler* pETS_ );
// Create a task set.
enkiTaskSet* enkiCreateTaskSet( enkiTaskScheduler* pETS_, enkiTaskExecuteRange taskFunc_ );
// Delete a task set.
void enkiDeleteTaskSet( enkiTaskSet* pTaskSet_ );
// Schedule the task
void enkiAddTaskSetToPipe( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_,
void* pArgs_, uint32_t setSize_ );
// Schedule the task with a minimum range.
// This should be set to a value which results in computation effort of at least 10k
// clock cycles to minimize task scheduler overhead.
// NOTE: The last partition will be smaller than m_MinRange if m_SetSize is not a multiple
// of m_MinRange.
// Also known as grain size in literature.
void enkiAddTaskSetToPipeMinRange( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_,
void* pArgs_, uint32_t setSize_, uint32_t minRange_ );
// Check if TaskSet is complete. Doesn't wait. Returns 1 if complete, 0 if not.
int enkiIsTaskSetComplete( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ );
// Wait for a given task.
// Should only be called from the thread which created the task scheduler, or from within a task.
// If called with 0 it will try to run a single task, then return.
void enkiWaitForTaskSet( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ );
// Waits for all task sets to complete - not guaranteed to work unless we know we
// are in a situation where tasks aren't being continuously added.
void enkiWaitForAll( enkiTaskScheduler* pETS_ );
// Get the number of threads used for running tasks, including the main thread.
uint32_t enkiGetNumTaskThreads( enkiTaskScheduler* pETS_ );
// TaskScheduler implements several callbacks intended for profilers
typedef void (*enkiProfilerCallbackFunc)( uint32_t threadnum_ );
struct enkiProfilerCallbacks
{
enkiProfilerCallbackFunc threadStart;
enkiProfilerCallbackFunc threadStop;
enkiProfilerCallbackFunc waitStart;
enkiProfilerCallbackFunc waitStop;
};
// Get the callback structure so it can be set
struct enkiProfilerCallbacks* enkiGetProfilerCallbacks( enkiTaskScheduler* pETS_ );
#ifdef __cplusplus
}
#endif
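// Usage sketch (compiled out; MyRangeFunc is a hypothetical example function):
// create a scheduler, run a task set over 1024 items, then clean up.
#if 0
static void MyRangeFunc( uint32_t start_, uint32_t end_, uint32_t threadnum_, void* pArgs_ )
{
    uint32_t i;
    (void)threadnum_; (void)pArgs_;
    for( i = start_; i < end_; ++i ) { /* process item i */ }
}
int main( void )
{
    enkiTaskScheduler* pETS = enkiNewTaskScheduler();
    enkiTaskSet* pTask;
    enkiInitTaskScheduler( pETS );
    pTask = enkiCreateTaskSet( pETS, MyRangeFunc );
    enkiAddTaskSetToPipe( pETS, pTask, 0, 1024 ); /* pArgs_ == 0, setSize_ == 1024 */
    enkiWaitForTaskSet( pETS, pTask );
    enkiDeleteTaskSet( pTask );
    enkiDeleteTaskScheduler( pETS );
    return 0;
}
#endif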

View File

@@ -0,0 +1,210 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#pragma once
#include <stdint.h>
#include <assert.h>
#ifdef _WIN32
#include "Atomics.h"
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#define THREADFUNC_DECL DWORD WINAPI
#define THREAD_LOCAL __declspec( thread )
namespace enki
{
typedef HANDLE threadid_t;
// declare the thread start function as:
// THREADFUNC_DECL MyThreadStart( void* pArg );
inline bool ThreadCreate( threadid_t* returnid, DWORD ( WINAPI *StartFunc) (void* ), void* pArg )
{
// posix equiv pthread_create
DWORD threadid;
*returnid = CreateThread( 0, 0, StartFunc, pArg, 0, &threadid );
return *returnid != NULL;
}
inline bool ThreadTerminate( threadid_t threadid )
{
// posix equiv pthread_cancel; note this only closes the handle, it does not stop the thread
return CloseHandle( threadid ) != 0; // CloseHandle returns non-zero on success
}
inline uint32_t GetNumHardwareThreads()
{
SYSTEM_INFO sysInfo;
GetSystemInfo(&sysInfo);
return sysInfo.dwNumberOfProcessors;
}
}
#else // posix
#include <pthread.h>
#include <unistd.h>
#define THREADFUNC_DECL void*
#define THREAD_LOCAL __thread
namespace enki
{
typedef pthread_t threadid_t;
// declare the thread start function as:
// THREADFUNC_DECL MyThreadStart( void* pArg );
inline bool ThreadCreate( threadid_t* returnid, void* ( *StartFunc) (void* ), void* pArg )
{
// posix equiv pthread_create
int32_t retval = pthread_create( returnid, NULL, StartFunc, pArg );
return retval == 0;
}
inline bool ThreadTerminate( threadid_t threadid )
{
// posix equiv pthread_cancel
return pthread_cancel( threadid ) == 0;
}
inline uint32_t GetNumHardwareThreads()
{
return (uint32_t)sysconf( _SC_NPROCESSORS_ONLN );
}
}
#endif // posix
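// Illustrative sketch (compiled out): launching a thread with the wrappers above.
// MyThreadStart is a hypothetical example function; returning 0 is valid for
// both the Windows (DWORD) and posix (void*) declarations.
#if 0
THREADFUNC_DECL MyThreadStart( void* pArg )
{
    (void)pArg;  // per-thread work goes here
    return 0;
}
inline void ExampleThreadLaunch()
{
    enki::threadid_t id;
    if( enki::ThreadCreate( &id, MyThreadStart, 0 ) )
    {
        enki::ThreadTerminate( id );  // on Windows this only closes the handle
    }
}
#endif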
// Semaphore implementation
#ifdef _WIN32
namespace enki
{
struct semaphoreid_t
{
HANDLE sem;
};
inline void SemaphoreCreate( semaphoreid_t& semaphoreid )
{
semaphoreid.sem = CreateSemaphore(NULL, 0, MAXLONG, NULL );
}
inline void SemaphoreClose( semaphoreid_t& semaphoreid )
{
CloseHandle( semaphoreid.sem );
}
inline void SemaphoreWait( semaphoreid_t& semaphoreid )
{
DWORD retval = WaitForSingleObject( semaphoreid.sem, INFINITE );
assert( retval != WAIT_FAILED );
}
inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting )
{
if( countWaiting )
{
ReleaseSemaphore( semaphoreid.sem, countWaiting, NULL );
}
}
}
#elif defined(__MACH__)
// OS X does not support unnamed POSIX semaphores (sem_init fails), so use Mach semaphores
// see https://developer.apple.com/library/content/documentation/Darwin/Conceptual/KernelProgramming/synchronization/synchronization.html
#include <mach/mach.h>
namespace enki
{
struct semaphoreid_t
{
semaphore_t sem;
};
inline void SemaphoreCreate( semaphoreid_t& semaphoreid )
{
semaphore_create( mach_task_self(), &semaphoreid.sem, SYNC_POLICY_FIFO, 0 );
}
inline void SemaphoreClose( semaphoreid_t& semaphoreid )
{
semaphore_destroy( mach_task_self(), semaphoreid.sem );
}
inline void SemaphoreWait( semaphoreid_t& semaphoreid )
{
semaphore_wait( semaphoreid.sem );
}
inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting )
{
while( countWaiting-- > 0 )
{
semaphore_signal( semaphoreid.sem );
}
}
}
#else // POSIX
#include <semaphore.h>
namespace enki
{
struct semaphoreid_t
{
sem_t sem;
};
inline void SemaphoreCreate( semaphoreid_t& semaphoreid )
{
int err = sem_init( &semaphoreid.sem, 0, 0 );
assert( err == 0 );
}
inline void SemaphoreClose( semaphoreid_t& semaphoreid )
{
sem_destroy( &semaphoreid.sem );
}
inline void SemaphoreWait( semaphoreid_t& semaphoreid )
{
int err = sem_wait( &semaphoreid.sem );
assert( err == 0 );
}
inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting )
{
while( countWaiting-- > 0 )
{
sem_post( &semaphoreid.sem );
}
}
}
#endif
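// Illustrative sketch (compiled out): the semaphore lifecycle as the scheduler
// uses it - waiters block in SemaphoreWait until a signaller passes the number
// of waiting threads to SemaphoreSignal. ExampleSemaphoreUse is hypothetical.
#if 0
inline void ExampleSemaphoreUse()
{
    enki::semaphoreid_t sem;
    enki::SemaphoreCreate( sem );
    enki::SemaphoreSignal( sem, 1 );  // countWaiting == 1: release one waiter
    enki::SemaphoreWait( sem );       // returns immediately, count is already 1
    enki::SemaphoreClose( sem );
}
#endif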