initial commit, 4.5 stable

2025-09-16 20:46:46 -04:00
commit 9d30169a8d
13378 changed files with 7050105 additions and 0 deletions


@@ -0,0 +1,56 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include <functional>
#include "parallel_reduce.h"
namespace embree
{
template<typename Index, class UnaryPredicate>
__forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred)
{
std::atomic_bool ret(false);
#if defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
tbb::task_group_context context;
tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) {
if (context.is_group_execution_cancelled()) return;
for (size_t i = r.begin(); i != r.end(); ++i) {
if (pred(i)) {
ret = true;
context.cancel_group_execution();
}
}
});
#else
tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) {
if (tbb::task::self().is_cancelled()) return;
for (size_t i = r.begin(); i != r.end(); ++i) {
if (pred(i)) {
ret = true;
tbb::task::self().cancel_group_execution();
}
}
});
#endif
#else
ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool {
bool localret = false;
for (auto i=r.begin(); i<r.end(); ++i) {
localret |= pred(i);
}
return localret;
},
std::bit_or<bool>()
);
#endif
return ret;
}
} // end namespace
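
For orientation, a minimal usage sketch (not part of the commit): it assumes this header is on the include path and that a tasking backend such as TASKING_TBB is configured, as the header requires; containsNegative and data are hypothetical names.

#include "parallel_any_of.h"
#include <vector>

bool containsNegative(const std::vector<float>& data)
{
  // returns true if any element tests negative; under TBB the remaining
  // iterations are cancelled once a match has been found
  return embree::parallel_any_of(size_t(0), data.size(),
    [&](size_t i) { return data[i] < 0.0f; });
}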


@@ -0,0 +1,93 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for.h"
namespace embree
{
template<typename Ty, typename Index, typename Predicate>
inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
{
Index j = first;
for (Index i=first; i<last; i++)
if (predicate(data[i]))
data[j++] = data[i];
return j;
}
template<typename Ty, typename Index, typename Predicate>
inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
{
/* sequential fallback */
if (end-begin <= minStepSize)
return sequential_filter(data,begin,end,predicate);
/* calculate number of tasks to use */
enum { MAX_TASKS = 64 };
const Index numThreads = TaskScheduler::threadCount();
const Index numBlocks = (end-begin+minStepSize-1)/minStepSize;
const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS);
/* filter blocks */
Index nused[MAX_TASKS];
Index nfree[MAX_TASKS];
parallel_for(taskCount, [&](const Index taskIndex)
{
const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
const Index i2 = sequential_filter(data,i0,i1,predicate);
nused[taskIndex] = i2-i0;
nfree[taskIndex] = i1-i2;
});
/* calculate offsets */
Index sused=0;
Index sfree=0;
Index pfree[MAX_TASKS];
for (Index i=0; i<taskCount; i++)
{
sused+=nused[i];
Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
}
/* return if we did not filter out any element */
assert(sfree <= end-begin);
assert(sused <= end-begin);
if (sused == end-begin)
return end;
/* otherwise we have to copy misplaced elements around */
parallel_for(taskCount, [&](const Index taskIndex)
{
/* destination to write elements to */
Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
Index dst_end = min(dst+nfree[taskIndex],begin+sused);
if (dst_end <= dst) return;
/* range of misplaced elements to copy to destination */
Index r0 = pfree[taskIndex];
Index r1 = r0+dst_end-dst;
/* find range in misplaced elements in back to front order */
Index k0=0;
for (Index i=taskCount-1; i>0; i--)
{
if (k0 > r1) break;
Index k1 = k0+nused[i];
Index src = begin+(i+0)*(end-begin)/taskCount+nused[i];
for (Index i=max(r0,k0); i<min(r1,k1); i++) {
Index isrc = src-i+k0-1;
assert(dst >= begin && dst < end);
assert(isrc >= begin && isrc < end);
data[dst++] = data[isrc];
}
k0 = k1;
}
});
return begin+sused;
}
}
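
A hedged usage sketch (not part of the commit): parallel_filter compacts the elements that satisfy the predicate to the front of the array in place and returns the index one past the last kept element; keepPositive is a hypothetical name.

#include "parallel_filter.h"
#include <vector>

size_t keepPositive(std::vector<int>& v)
{
  // kept elements end up in [0, result); the parallel path does not
  // preserve their relative order
  return embree::parallel_filter(v.data(), size_t(0), v.size(), size_t(1024),
    [](const int& x) { return x > 0; });
}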


@@ -0,0 +1,161 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "../tasking/taskscheduler.h"
#include "../sys/array.h"
#include "../math/emath.h"
#include "../math/range.h"
namespace embree
{
/* parallel_for without range */
template<typename Index, typename Func>
__forceinline void parallel_for( const Index N, const Func& func)
{
#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)
if (N) {
TaskScheduler::TaskGroupContext context;
TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) {
assert(r.size() == 1);
func(r.begin());
},&context);
TaskScheduler::wait();
if (context.cancellingException != nullptr) {
std::rethrow_exception(context.cancellingException);
}
}
#elif defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
tbb::task_group_context context;
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},context);
if (context.is_group_execution_cancelled())
throw std::runtime_error("task cancelled");
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
});
if (tbb::task::self().is_cancelled())
throw std::runtime_error("task cancelled");
#endif
#elif defined(TASKING_PPL)
concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
});
#else
# error "no tasking system enabled"
#endif
}
/* parallel for with range and granularity */
template<typename Index, typename Func>
__forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func)
{
assert(first <= last);
#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)
TaskScheduler::TaskGroupContext context;
TaskScheduler::spawn(first,last,minStepSize,func,&context);
TaskScheduler::wait();
if (context.cancellingException != nullptr) {
std::rethrow_exception(context.cancellingException);
}
#elif defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
tbb::task_group_context context;
tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
func(range<Index>(r.begin(),r.end()));
},context);
if (context.is_group_execution_cancelled())
throw std::runtime_error("task cancelled");
#else
tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
func(range<Index>(r.begin(),r.end()));
});
if (tbb::task::self().is_cancelled())
throw std::runtime_error("task cancelled");
#endif
#elif defined(TASKING_PPL)
concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) {
func(range<Index>(i,i+1));
});
#else
# error "no tasking system enabled"
#endif
}
/* parallel for with range */
template<typename Index, typename Func>
__forceinline void parallel_for( const Index first, const Index last, const Func& func)
{
assert(first <= last);
parallel_for(first,last,(Index)1,func);
}
#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001)
template<typename Index, typename Func>
__forceinline void parallel_for_static( const Index N, const Func& func)
{
#if TBB_INTERFACE_VERSION >= 12002
tbb::task_group_context context;
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},tbb::simple_partitioner(),context);
if (context.is_group_execution_cancelled())
throw std::runtime_error("task cancelled");
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},tbb::simple_partitioner());
if (tbb::task::self().is_cancelled())
throw std::runtime_error("task cancelled");
#endif
}
typedef tbb::affinity_partitioner affinity_partitioner;
template<typename Index, typename Func>
__forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap)
{
#if TBB_INTERFACE_VERSION >= 12002
tbb::task_group_context context;
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},ap,context);
if (context.is_group_execution_cancelled())
throw std::runtime_error("task cancelled");
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},ap);
if (tbb::task::self().is_cancelled())
throw std::runtime_error("task cancelled");
#endif
}
#else
template<typename Index, typename Func>
__forceinline void parallel_for_static( const Index N, const Func& func)
{
parallel_for(N,func);
}
struct affinity_partitioner {
};
template<typename Index, typename Func>
__forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap)
{
parallel_for(N,func);
}
#endif
}
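
A usage sketch for the blocked-range variant (illustrative only; assumes one of TASKING_INTERNAL, TASKING_TBB, or TASKING_PPL is defined, as the #error branch demands):

#include "parallel_for.h"

void scale(float* a, size_t N, float s)
{
  // chunks of at least 1024 iterations are handed to each task
  embree::parallel_for(size_t(0), N, size_t(1024),
    [&](const embree::range<size_t>& r) {
      for (size_t i = r.begin(); i < r.end(); i++)
        a[i] *= s;
    });
}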


@@ -0,0 +1,160 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for.h"
namespace embree
{
template<typename ArrayArray, typename Func>
__forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
{
size_t k=0;
for (size_t i=0; i!=array2.size(); ++i) {
const size_t N = array2[i]->size();
if (N) func(array2[i],range<size_t>(0,N),k);
k+=N;
}
}
class ParallelForForState
{
public:
enum { MAX_TASKS = 64 };
__forceinline ParallelForForState ()
: taskCount(0) {}
template<typename ArrayArray>
__forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) {
init(array2,minStepSize);
}
template<typename SizeFunc>
__forceinline ParallelForForState (const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize) {
init(numArrays,getSize,minStepSize);
}
template<typename SizeFunc>
__forceinline void init ( const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize )
{
/* first calculate total number of elements */
size_t N = 0;
for (size_t i=0; i<numArrays; i++) {
N += getSize(i);
}
this->N = N;
/* calculate number of tasks to use */
const size_t numThreads = TaskScheduler::threadCount();
const size_t numBlocks = (N+minStepSize-1)/minStepSize;
taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS)));
/* calculate start (i,j) for each task */
size_t taskIndex = 0;
i0[taskIndex] = 0;
j0[taskIndex] = 0;
size_t k0 = (++taskIndex)*N/taskCount;
for (size_t i=0, k=0; taskIndex < taskCount; i++)
{
assert(i<numArrays);
size_t j=0, M = getSize(i);
while (j<M && k+M-j >= k0 && taskIndex < taskCount) {
assert(taskIndex<taskCount);
i0[taskIndex] = i;
j0[taskIndex] = j += k0-k;
k=k0;
k0 = (++taskIndex)*N/taskCount;
}
k+=M-j;
}
}
template<typename ArrayArray>
__forceinline void init ( ArrayArray& array2, const size_t minStepSize )
{
init(array2.size(),[&](size_t i) { return array2[i] ? array2[i]->size() : 0; },minStepSize);
}
__forceinline size_t size() const {
return N;
}
public:
size_t i0[MAX_TASKS];
size_t j0[MAX_TASKS];
size_t taskCount;
size_t N;
};
template<typename ArrayArray, typename Func>
__forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
{
ParallelForForState state(array2,minStepSize);
parallel_for(state.taskCount, [&](const size_t taskIndex)
{
/* calculate range */
const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
size_t i0 = state.i0[taskIndex];
size_t j0 = state.j0[taskIndex];
/* iterate over arrays */
size_t k=k0;
for (size_t i=i0; k<k1; i++) {
const size_t N = array2[i] ? array2[i]->size() : 0;
const size_t r0 = j0, r1 = min(N,r0+k1-k);
if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k);
k+=r1-r0; j0 = 0;
}
});
}
template<typename ArrayArray, typename Func>
__forceinline void parallel_for_for( ArrayArray& array2, const Func& func )
{
parallel_for_for(array2,1,func);
}
template<typename ArrayArray, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
ParallelForForState state(array2,minStepSize);
Value temp[ParallelForForState::MAX_TASKS];
for (size_t i=0; i<state.taskCount; i++)
temp[i] = identity;
parallel_for(state.taskCount, [&](const size_t taskIndex)
{
/* calculate range */
const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
size_t i0 = state.i0[taskIndex];
size_t j0 = state.j0[taskIndex];
/* iterate over arrays */
size_t k=k0;
for (size_t i=i0; k<k1; i++) {
const size_t N = array2[i] ? array2[i]->size() : 0;
const size_t r0 = j0, r1 = min(N,r0+k1-k);
if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k));
k+=r1-r0; j0 = 0;
}
});
Value ret = identity;
for (size_t i=0; i<state.taskCount; i++)
ret = reduction(ret,temp[i]);
return ret;
}
template<typename ArrayArray, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction)
{
return parallel_for_for_reduce(array2,1,identity,func,reduction);
}
}
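
A sketch of the array-of-arrays reduction (illustrative; arrays and totalElements are hypothetical names, with arrays matching the array2[i]->size() protocol ParallelForForState expects):

#include "parallel_for_for.h"
#include <functional>
#include <vector>

size_t totalElements(std::vector<std::vector<int>*>& arrays)
{
  // k is the global linear index of r.begin() across all inner arrays
  return embree::parallel_for_for_reduce(arrays, size_t(1), size_t(0),
    [](std::vector<int>* a, const embree::range<size_t>& r, size_t k) -> size_t {
      return r.size();
    },
    std::plus<size_t>());
}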


@@ -0,0 +1,142 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for_for.h"
#include "parallel_prefix_sum.h"
namespace embree
{
template<typename Value>
struct ParallelForForPrefixSumState : public ParallelForForState
{
__forceinline ParallelForForPrefixSumState () {}
template<typename ArrayArray>
__forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize)
: ParallelForForState(array2,minStepSize) {}
template<typename SizeFunc>
__forceinline ParallelForForPrefixSumState (size_t numArrays, const SizeFunc& getSize, const size_t minStepSize)
: ParallelForForState(numArrays,getSize,minStepSize) {}
ParallelPrefixSumState<Value> prefix_state;
};
template<typename SizeFunc, typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_prefix_sum0_( ParallelForForPrefixSumState<Value>& state, Index minStepSize,
const SizeFunc& getSize, const Value& identity, const Func& func, const Reduction& reduction)
{
/* calculate number of tasks to use */
const size_t taskCount = state.taskCount;
/* perform parallel prefix sum */
parallel_for(taskCount, [&](const size_t taskIndex)
{
const size_t k0 = (taskIndex+0)*state.size()/taskCount;
const size_t k1 = (taskIndex+1)*state.size()/taskCount;
size_t i0 = state.i0[taskIndex];
size_t j0 = state.j0[taskIndex];
/* iterate over arrays */
size_t k=k0;
Value N=identity;
for (size_t i=i0; k<k1; i++) {
const size_t size = getSize(i);
const size_t r0 = j0, r1 = min(size,r0+k1-k);
if (r1 > r0) N = reduction(N, func((Index)i,range<Index>((Index)r0,(Index)r1),(Index)k));
k+=r1-r0; j0 = 0;
}
state.prefix_state.counts[taskIndex] = N;
});
/* calculate prefix sum */
Value sum=identity;
for (size_t i=0; i<taskCount; i++)
{
const Value c = state.prefix_state.counts[i];
state.prefix_state.sums[i] = sum;
sum=reduction(sum,c);
}
return sum;
}
template<typename SizeFunc, typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_prefix_sum1_( ParallelForForPrefixSumState<Value>& state, Index minStepSize,
const SizeFunc& getSize,
const Value& identity, const Func& func, const Reduction& reduction)
{
/* calculate number of tasks to use */
const size_t taskCount = state.taskCount;
/* perform parallel prefix sum */
parallel_for(taskCount, [&](const size_t taskIndex)
{
const size_t k0 = (taskIndex+0)*state.size()/taskCount;
const size_t k1 = (taskIndex+1)*state.size()/taskCount;
size_t i0 = state.i0[taskIndex];
size_t j0 = state.j0[taskIndex];
/* iterate over arrays */
size_t k=k0;
Value N=identity;
for (size_t i=i0; k<k1; i++) {
const size_t size = getSize(i);
const size_t r0 = j0, r1 = min(size,r0+k1-k);
if (r1 > r0) N = reduction(N, func((Index)i,range<Index>((Index)r0,(Index)r1),(Index)k,reduction(state.prefix_state.sums[taskIndex],N)));
k+=r1-r0; j0 = 0;
}
state.prefix_state.counts[taskIndex] = N;
});
/* calculate prefix sum */
Value sum=identity;
for (size_t i=0; i<taskCount; i++)
{
const Value c = state.prefix_state.counts[i];
state.prefix_state.sums[i] = sum;
sum=reduction(sum,c);
}
return sum;
}
template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state,
ArrayArray& array2, Index minStepSize,
const Value& identity, const Func& func, const Reduction& reduction)
{
return parallel_for_for_prefix_sum0_(state,minStepSize,
[&](Index i) { return array2[i] ? array2[i]->size() : 0; },
identity,
[&](Index i, const range<Index>& r, Index k) { return func(array2[i], r, k, i); },
reduction);
}
template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state,
ArrayArray& array2, Index minStepSize,
const Value& identity, const Func& func, const Reduction& reduction)
{
return parallel_for_for_prefix_sum1_(state,minStepSize,
[&](Index i) { return array2[i] ? array2[i]->size() : 0; },
identity,
[&](Index i, const range<Index>& r, Index k, const Value& base) { return func(array2[i], r, k, i, base); },
reduction);
}
template<typename ArrayArray, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
const Value& identity, const Func& func, const Reduction& reduction)
{
return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction);
}
template<typename ArrayArray, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
const Value& identity, const Func& func, const Reduction& reduction)
{
return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction);
}
}
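
The intended two-pass pattern, sketched with hypothetical names (compact, arrays, out): pass 0 counts selected elements per range, and pass 1 replays the loop, handing each range the exclusive prefix of all preceding counts as its base offset.

#include "parallel_for_for_prefix_sum.h"
#include <functional>
#include <vector>

size_t compact(std::vector<std::vector<int>*>& arrays, int* out)
{
  embree::ParallelForForPrefixSumState<size_t> state(arrays, size_t(1));
  // pass 0: count the positive elements in each range
  size_t total = embree::parallel_for_for_prefix_sum0(state, arrays, size_t(0),
    [&](std::vector<int>* a, const embree::range<size_t>& r, size_t k, size_t i) -> size_t {
      size_t n = 0;
      for (size_t j = r.begin(); j < r.end(); j++) n += ((*a)[j] > 0);
      return n;
    }, std::plus<size_t>());
  // pass 1: base is the exclusive prefix sum of the counts before this range
  embree::parallel_for_for_prefix_sum1(state, arrays, size_t(0),
    [&](std::vector<int>* a, const embree::range<size_t>& r, size_t k, size_t i, const size_t& base) -> size_t {
      size_t n = 0;
      for (size_t j = r.begin(); j < r.end(); j++)
        if ((*a)[j] > 0) out[base + n++] = (*a)[j];
      return n;
    }, std::plus<size_t>());
  return total;
}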


@@ -0,0 +1,85 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_sort.h"
namespace embree
{
/*! implementation of a key/value map with parallel construction */
template<typename Key, typename Val>
class parallel_map
{
/* key/value pair to build the map */
struct KeyValue
{
__forceinline KeyValue () {}
__forceinline KeyValue (const Key key, const Val val)
: key(key), val(val) {}
__forceinline operator Key() const {
return key;
}
public:
Key key;
Val val;
};
public:
/*! default constructor */
parallel_map () {}
/*! construction from pair of vectors */
template<typename KeyVector, typename ValVector>
parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); }
/*! initializes the parallel map from vectors of keys and values */
template<typename KeyVector, typename ValVector>
void init(const KeyVector& keys, const ValVector& values)
{
/* reserve sufficient space for all data */
assert(keys.size() == values.size());
vec.resize(keys.size());
/* generate key/value pairs */
parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) {
for (size_t i=r.begin(); i<r.end(); i++)
vec[i] = KeyValue((Key)keys[i],values[i]);
});
/* perform parallel radix sort of the key/value pairs */
std::vector<KeyValue> temp(keys.size());
radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size());
}
/*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr if the key is not contained in the map. */
__forceinline const Val* lookup(const Key& key) const
{
typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
if (i == vec.end()) return nullptr;
if (i->key != key) return nullptr;
return &i->val;
}
/*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */
__forceinline Val lookup(const Key& key, const Val& def) const
{
typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
if (i == vec.end()) return def;
if (i->key != key) return def;
return i->val;
}
/*! clears all state */
void clear() {
vec.clear();
}
private:
std::vector<KeyValue> vec; //!< vector containing sorted elements
};
}
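
A small usage sketch (illustrative; Key must be radix-sortable, e.g. an unsigned integer type, and the two input vectors must have equal length):

#include "parallel_map.h"
#include <cstdint>
#include <vector>

void example(const std::vector<uint32_t>& keys, const std::vector<float>& values)
{
  embree::parallel_map<uint32_t, float> map(keys, values);
  const float* p = map.lookup(42u); // nullptr when the key is absent
  float f = map.lookup(42u, 0.0f);  // default value when the key is absent
  (void)p; (void)f;
}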


@@ -0,0 +1,283 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for.h"
#include "../math/range.h"
namespace embree
{
/* serial partitioning */
template<typename T, typename V, typename IsLeft, typename Reduction_T>
__forceinline size_t serial_partitioning(T* array,
const size_t begin,
const size_t end,
V& leftReduction,
V& rightReduction,
const IsLeft& is_left,
const Reduction_T& reduction_t)
{
T* l = array + begin;
T* r = array + end - 1;
while(1)
{
/* *l < pivot */
while (likely(l <= r && is_left(*l) ))
{
//prefetchw(l+4); // FIXME: enable?
reduction_t(leftReduction,*l);
++l;
}
/* *r >= pivot */
while (likely(l <= r && !is_left(*r)))
{
//prefetchw(r-4); // FIXME: enable?
reduction_t(rightReduction,*r);
--r;
}
if (r<l) break;
reduction_t(leftReduction ,*r);
reduction_t(rightReduction,*l);
xchg(*l,*r);
l++; r--;
}
return l - array;
}
template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
class __aligned(64) parallel_partition_task
{
ALIGNED_CLASS_(64);
private:
static const size_t MAX_TASKS = 64;
T* array;
size_t N;
const IsLeft& is_left;
const Reduction_T& reduction_t;
const Reduction_V& reduction_v;
const Vi& identity;
size_t numTasks;
__aligned(64) size_t counter_start[MAX_TASKS+1];
__aligned(64) size_t counter_left[MAX_TASKS+1];
__aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];
__aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS];
__aligned(64) V leftReductions[MAX_TASKS];
__aligned(64) V rightReductions[MAX_TASKS];
public:
__forceinline parallel_partition_task(T* array,
const size_t N,
const Vi& identity,
const IsLeft& is_left,
const Reduction_T& reduction_t,
const Reduction_V& reduction_v,
const size_t BLOCK_SIZE)
: array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}
__forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
{
size_t i = 0;
while(index >= (size_t)r[i].size())
{
assert(i < numRanges);
index -= (size_t)r[i].size();
i++;
}
return &r[i];
}
__forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
const size_t numRightMisplacedRanges,
const size_t startID,
const size_t endID)
{
size_t leftLocalIndex = startID;
size_t rightLocalIndex = startID;
const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);
size_t l_left = l_range->size() - leftLocalIndex;
size_t r_left = r_range->size() - rightLocalIndex;
T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
size_t size = endID - startID;
size_t items = min(size,min(l_left,r_left));
while (size)
{
if (unlikely(l_left == 0))
{
l_range++;
l_left = l_range->size();
l = &array[l_range->begin()];
items = min(size,min(l_left,r_left));
}
if (unlikely(r_left == 0))
{
r_range++;
r_left = r_range->size();
r = &array[r_range->begin()];
items = min(size,min(l_left,r_left));
}
size -= items;
l_left -= items;
r_left -= items;
while(items) {
items--;
xchg(*l++,*r++);
}
}
}
__forceinline size_t partition(V& leftReduction, V& rightReduction)
{
/* partition the individual ranges for each task */
parallel_for(numTasks,[&] (const size_t taskID) {
const size_t startID = (taskID+0)*N/numTasks;
const size_t endID = (taskID+1)*N/numTasks;
V local_left(identity);
V local_right(identity);
const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
counter_start[taskID] = startID;
counter_left [taskID] = mid-startID;
leftReductions[taskID] = local_left;
rightReductions[taskID] = local_right;
});
counter_start[numTasks] = N;
counter_left[numTasks] = 0;
/* finalize the reductions */
for (size_t i=0; i<numTasks; i++) {
reduction_v(leftReduction,leftReductions[i]);
reduction_v(rightReduction,rightReductions[i]);
}
/* calculate mid point for partitioning */
size_t mid = counter_left[0];
for (size_t i=1; i<numTasks; i++)
mid += counter_left[i];
const range<ssize_t> globalLeft (0,mid);
const range<ssize_t> globalRight(mid,N);
/* calculate all left and right ranges that are on the wrong global side */
size_t numMisplacedRangesLeft = 0;
size_t numMisplacedRangesRight = 0;
size_t numMisplacedItemsLeft MAYBE_UNUSED = 0;
size_t numMisplacedItemsRight MAYBE_UNUSED = 0;
for (size_t i=0; i<numTasks; i++)
{
const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
const range<ssize_t> left_misplaced = globalLeft. intersect(right_range);
const range<ssize_t> right_misplaced = globalRight.intersect(left_range);
if (!left_misplaced.empty())
{
numMisplacedItemsLeft += left_misplaced.size();
leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
}
if (!right_misplaced.empty())
{
numMisplacedItemsRight += right_misplaced.size();
rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
}
}
assert( numMisplacedItemsLeft == numMisplacedItemsRight );
/* if no items are misplaced we are done */
if (numMisplacedItemsLeft == 0)
return mid;
/* otherwise we copy the items to the right place in parallel */
parallel_for(numTasks,[&] (const size_t taskID) {
const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks;
swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);
});
return mid;
}
};
template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
__noinline size_t parallel_partitioning(T* array,
const size_t begin,
const size_t end,
const Vi &identity,
V &leftReduction,
V &rightReduction,
const IsLeft& is_left,
const Reduction_T& reduction_t,
const Reduction_V& reduction_v,
size_t BLOCK_SIZE = 128)
{
/* fall back to single threaded partitioning for small N */
if (unlikely(end-begin < BLOCK_SIZE))
return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
/* otherwise use parallel code */
else {
typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
return begin+p->partition(leftReduction,rightReduction);
}
}
template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
__noinline size_t parallel_partitioning(T* array,
const size_t begin,
const size_t end,
const Vi &identity,
V &leftReduction,
V &rightReduction,
const IsLeft& is_left,
const Reduction_T& reduction_t,
const Reduction_V& reduction_v,
size_t BLOCK_SIZE,
size_t PARALLEL_THRESHOLD)
{
/* fall back to single threaded partitioning for small N */
if (unlikely(end-begin < PARALLEL_THRESHOLD))
return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
/* otherwise use parallel code */
else {
typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
return begin+p->partition(leftReduction,rightReduction);
}
}
template<typename T, typename IsLeft>
inline size_t parallel_partitioning(T* array,
const size_t begin,
const size_t end,
const IsLeft& is_left,
size_t BLOCK_SIZE = 128)
{
size_t leftReduction = 0;
size_t rightReduction = 0;
return parallel_partitioning(
array,begin,end,0,leftReduction,rightReduction,is_left,
[] (size_t& t,const T& ref) { },
[] (size_t& t0,size_t& t1) { },
BLOCK_SIZE);
}
}
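
A usage sketch of the predicate-only overload (illustrative; splitBelow is a hypothetical name): the array is reordered in place so that all elements satisfying is_left come first, and the split point is returned.

#include "parallel_partition.h"
#include <vector>

size_t splitBelow(std::vector<int>& v, int pivot)
{
  // afterwards [0,mid) holds x < pivot and [mid,v.size()) the rest
  return embree::parallel_partitioning(v.data(), size_t(0), v.size(),
    [&](const int& x) { return x < pivot; });
}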


@@ -0,0 +1,85 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for.h"
namespace embree
{
template<typename Value>
struct ParallelPrefixSumState
{
enum { MAX_TASKS = 64 };
Value counts[MAX_TASKS];
Value sums [MAX_TASKS];
};
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
{
/* calculate number of tasks to use */
const size_t numThreads = TaskScheduler::threadCount();
const size_t numBlocks = (last-first+minStepSize-1)/minStepSize;
const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
/* perform parallel prefix sum */
parallel_for(taskCount, [&](const size_t taskIndex)
{
const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
});
/* calculate prefix sum */
Value sum=identity;
for (size_t i=0; i<taskCount; i++)
{
const Value c = state.counts[i];
state.sums[i] = sum;
sum=reduction(sum,c);
}
return sum;
}
/*! parallel calculation of prefix sums */
template<typename SrcArray, typename DstArray, typename Value, typename Add>
__forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096)
{
/* perform single threaded prefix operation for small N */
if (N < SINGLE_THREAD_THRESHOLD)
{
Value sum=identity;
for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
return sum;
}
/* perform parallel prefix operation for large N */
else
{
ParallelPrefixSumState<Value> state;
/* initial run just sets up start values for subtasks */
parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
Value s = identity;
for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
return s;
}, add);
/* final run calculates prefix sum */
return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
Value s = identity;
for (size_t i=r.begin(); i<r.end(); i++) {
dst[i] = add(sum,s);
s = add(s,src[i]);
}
return s;
}, add);
}
}
}
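
A usage sketch of the array overload (illustrative): it computes an exclusive prefix sum, i.e. dst[i] receives the sum of src[0..i), and returns the overall total.

#include "parallel_prefix_sum.h"
#include <vector>

unsigned exclusiveScan(const std::vector<unsigned>& src, std::vector<unsigned>& dst)
{
  // dst must already hold src.size() elements
  return embree::parallel_prefix_sum(src, dst, src.size(), 0u,
    [](unsigned a, unsigned b) { return a + b; });
}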


@@ -0,0 +1,146 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for.h"
namespace embree
{
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
{
return func(range<Index>(first,last));
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
return func(range<Index>(first,last));
}
template<typename Index, typename Value, typename Func, typename Reduction>
__noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
const Index maxTasks = 512;
const Index threadCount = (Index) TaskScheduler::threadCount();
taskCount = min(taskCount,threadCount,maxTasks);
/* parallel invocation of all tasks */
dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
parallel_for(taskCount, [&](const Index taskIndex) {
const Index k0 = first+(taskIndex+0)*(last-first)/taskCount;
const Index k1 = first+(taskIndex+1)*(last-first)/taskCount;
values[taskIndex] = func(range<Index>(k0,k1));
});
/* perform reduction over all tasks */
Value v = identity;
for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]);
return v;
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)
/* fast path for small number of iterations */
Index taskCount = (last-first+minStepSize-1)/minStepSize;
if (likely(taskCount == 1)) {
return func(range<Index>(first,last));
}
return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction);
#elif defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
tbb::task_group_context context;
const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
[&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
reduction,context);
//if (context.is_group_execution_cancelled())
// throw std::runtime_error("task cancelled");
return v;
#else
const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
[&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
reduction);
//if (tbb::task::self().is_cancelled())
// throw std::runtime_error("task cancelled");
return v;
#endif
#else // TASKING_PPL
struct AlignedValue
{
char storage[__alignof(Value)+sizeof(Value)];
static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); };
Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
AlignedValue(const Value& v) { new(getValuePtr()) Value(v); }
AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); }
AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); };
AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
operator Value() const { return *getValuePtr(); }
};
struct Iterator_Index
{
Index v;
typedef std::forward_iterator_tag iterator_category;
typedef AlignedValue value_type;
typedef Index difference_type;
typedef Index distance_type;
typedef AlignedValue* pointer;
typedef AlignedValue& reference;
__forceinline Iterator_Index() {}
__forceinline Iterator_Index(Index v) : v(v) {}
__forceinline bool operator== (Iterator_Index other) { return v == other.v; }
__forceinline bool operator!= (Iterator_Index other) { return v != other.v; }
__forceinline Iterator_Index operator++() { return Iterator_Index(++v); }
__forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); }
};
auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) {
assert(begin.v < end.v);
return reduction(start, func(range<Index>(begin.v, end.v)));
};
const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction);
return v;
#endif
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
{
if (likely(last-first < parallel_threshold)) {
return func(range<Index>(first,last));
} else {
return parallel_reduce(first,last,minStepSize,identity,func,reduction);
}
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
{
return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction);
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
{
auto funcr = [&] ( const range<Index> r ) {
Value v = identity;
for (Index i=r.begin(); i<r.end(); i++)
v = reduction(v,func(i));
return v;
};
return parallel_reduce(first,last,Index(1),identity,funcr,reduction);
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction )
{
return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction);
}
}
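
A usage sketch of the blocked reduction (illustrative; assumes a configured tasking backend): func reduces one subrange, and the reduction combines the per-task partial results.

#include "parallel_reduce.h"

float sum(const float* a, size_t N)
{
  return embree::parallel_reduce(size_t(0), N, size_t(1024), 0.0f,
    [&](const embree::range<size_t>& r) -> float {
      float s = 0.0f;
      for (size_t i = r.begin(); i < r.end(); i++) s += a[i];
      return s;
    },
    [](float x, float y) { return x + y; });
}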


@@ -0,0 +1,52 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_sort.h"
namespace embree
{
/* implementation of a set of values with parallel construction */
template<typename T>
class parallel_set
{
public:
/*! default constructor for the parallel set */
parallel_set () {}
/*! construction from vector */
template<typename Vector>
parallel_set (const Vector& in) { init(in); }
/*! initializes the parallel set from a vector */
template<typename Vector>
void init(const Vector& in)
{
/* copy data to internal vector */
vec.resize(in.size());
parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) {
for (size_t i=r.begin(); i<r.end(); i++)
vec[i] = in[i];
});
/* sort the data */
std::vector<T> temp(in.size());
radix_sort<T>(vec.data(),temp.data(),vec.size());
}
/*! tests if some element is in the set */
__forceinline bool lookup(const T& elt) const {
return std::binary_search(vec.begin(), vec.end(), elt);
}
/*! clears all state */
void clear() {
vec.clear();
}
private:
std::vector<T> vec; //!< vector containing sorted elements
};
}
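
Usage sketch (illustrative; T must be radix-sortable, and building the set per query here is for brevity only):

#include "parallel_set.h"
#include <cstdint>
#include <vector>

bool contains(const std::vector<uint32_t>& ids, uint32_t id)
{
  embree::parallel_set<uint32_t> set(ids); // sorts a private copy in parallel
  return set.lookup(id);                   // binary search on the sorted copy
}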


@@ -0,0 +1,454 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "../simd/simd.h"
#include "parallel_for.h"
#include <algorithm>
namespace embree
{
template<class T>
__forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
{
for(size_t i = 1;i<length;++i)
{
T v = array[i];
size_t j = i;
while(j > 0 && v < array[j-1])
{
array[j] = array[j-1];
--j;
}
array[j] = v;
}
}
template<class T>
__forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
{
for(size_t i = 1;i<length;++i)
{
T v = array[i];
size_t j = i;
while(j > 0 && v > array[j-1])
{
array[j] = array[j-1];
--j;
}
array[j] = v;
}
}
template<class T>
void quicksort_ascending(T *__restrict__ t,
const ssize_t begin,
const ssize_t end)
{
if (likely(begin < end))
{
const T pivotvalue = t[begin];
ssize_t left = begin - 1;
ssize_t right = end + 1;
while(1)
{
while (t[--right] > pivotvalue);
while (t[++left] < pivotvalue);
if (left >= right) break;
const T temp = t[right];
t[right] = t[left];
t[left] = temp;
}
const int pivot = right;
quicksort_ascending(t, begin, pivot);
quicksort_ascending(t, pivot + 1, end);
}
}
template<class T>
void quicksort_decending(T *__restrict__ t,
const ssize_t begin,
const ssize_t end)
{
if (likely(begin < end))
{
const T pivotvalue = t[begin];
ssize_t left = begin - 1;
ssize_t right = end + 1;
while(1)
{
while (t[--right] < pivotvalue);
while (t[++left] > pivotvalue);
if (left >= right) break;
const T temp = t[right];
t[right] = t[left];
t[left] = temp;
}
const int pivot = right;
quicksort_decending(t, begin, pivot);
quicksort_decending(t, pivot + 1, end);
}
}
template<class T, ssize_t THRESHOLD>
void quicksort_insertionsort_ascending(T *__restrict__ t,
const ssize_t begin,
const ssize_t end)
{
if (likely(begin < end))
{
const ssize_t size = end-begin+1;
if (likely(size <= THRESHOLD))
{
insertionsort_ascending<T>(&t[begin],size);
}
else
{
const T pivotvalue = t[begin];
ssize_t left = begin - 1;
ssize_t right = end + 1;
while(1)
{
while (t[--right] > pivotvalue);
while (t[++left] < pivotvalue);
if (left >= right) break;
const T temp = t[right];
t[right] = t[left];
t[left] = temp;
}
const ssize_t pivot = right;
quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
}
}
}
template<class T, ssize_t THRESHOLD>
void quicksort_insertionsort_decending(T *__restrict__ t,
const ssize_t begin,
const ssize_t end)
{
if (likely(begin < end))
{
const ssize_t size = end-begin+1;
if (likely(size <= THRESHOLD))
{
insertionsort_decending<T>(&t[begin],size);
}
else
{
const T pivotvalue = t[begin];
ssize_t left = begin - 1;
ssize_t right = end + 1;
while(1)
{
while (t[--right] < pivotvalue);
while (t[++left] > pivotvalue);
if (left >= right) break;
const T temp = t[right];
t[right] = t[left];
t[left] = temp;
}
const ssize_t pivot = right;
quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot);
quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end);
}
}
}
template<typename T>
static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
{
static const unsigned int BITS = 8;
static const unsigned int BUCKETS = (1 << BITS);
static const unsigned int CMP_SORT_THRESHOLD = 16;
__aligned(64) unsigned int count[BUCKETS];
/* clear buckets */
for (size_t i=0;i<BUCKETS;i++) count[i] = 0;
/* count buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
for (size_t i=0;i<num;i++)
count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;
/* prefix sums */
__aligned(64) unsigned int head[BUCKETS];
__aligned(64) unsigned int tail[BUCKETS];
head[0] = 0;
for (size_t i=1; i<BUCKETS; i++)
head[i] = head[i-1] + count[i-1];
for (size_t i=0; i<BUCKETS-1; i++)
tail[i] = head[i+1];
tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];
assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);
assert(tail[BUCKETS-1] == num);
/* in-place swap */
for (size_t i=0;i<BUCKETS;i++)
{
/* process bucket */
while(head[i] < tail[i])
{
T v = morton[head[i]];
while(1)
{
const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
if (b == i) break;
std::swap(v,morton[head[b]++]);
}
assert((unsigned(v) >> shift & (BUCKETS-1)) == i);
morton[head[i]++] = v;
}
}
if (shift == 0) return;
size_t offset = 0;
for (size_t i=0;i<BUCKETS;i++)
if (count[i])
{
for (size_t j=offset;j<offset+count[i]-1;j++)
assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);
if (unlikely(count[i] < CMP_SORT_THRESHOLD))
insertionsort_ascending(morton + offset, count[i]);
else
radixsort32(morton + offset, count[i], shift-BITS);
for (size_t j=offset;j<offset+count[i]-1;j++)
assert(morton[j] <= morton[j+1]);
offset += count[i];
}
}
template<typename Ty, typename Key>
class ParallelRadixSort
{
static const size_t MAX_TASKS = 64;
static const size_t BITS = 8;
static const size_t BUCKETS = (1 << BITS);
typedef unsigned int TyRadixCount[BUCKETS];
template<typename T>
static bool compare(const T& v0, const T& v1) {
return (Key)v0 < (Key)v1;
}
private:
ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement
public:
ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
: radixCount(nullptr), src(src), tmp(tmp), N(N) {}
void sort(const size_t blockSize)
{
assert(blockSize > 0);
/* perform single threaded sort for small N */
if (N<=blockSize) // also handles the special case of N == 0
{
/* do in-place sort inside the destination array */
std::sort(src,src+N,compare<Ty>);
}
/* perform parallel sort for large N */
else
{
const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
tbbRadixSort(numThreads);
}
}
~ParallelRadixSort()
{
alignedFree(radixCount);
radixCount = nullptr;
}
private:
void tbbRadixIteration0(const Key shift,
const Ty* __restrict const src,
Ty* __restrict const dst,
const size_t threadIndex, const size_t threadCount)
{
const size_t startID = (threadIndex+0)*N/threadCount;
const size_t endID = (threadIndex+1)*N/threadCount;
/* mask to extract the lowest BITS bits */
const Key mask = BUCKETS-1;
/* count how many items go into the buckets */
for (size_t i=0; i<BUCKETS; i++)
radixCount[threadIndex][i] = 0;
/* iterate over src array and count buckets */
unsigned int * __restrict const count = radixCount[threadIndex];
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
for (size_t i=startID; i<endID; i++) {
#if defined(__64BIT__)
const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
const Key index = ((Key)src[i] >> shift) & mask;
#endif
count[index]++;
}
}
void tbbRadixIteration1(const Key shift,
const Ty* __restrict const src,
Ty* __restrict const dst,
const size_t threadIndex, const size_t threadCount)
{
const size_t startID = (threadIndex+0)*N/threadCount;
const size_t endID = (threadIndex+1)*N/threadCount;
/* mask to extract the lowest BITS bits */
const Key mask = BUCKETS-1;
/* calculate total number of items for each bucket */
__aligned(64) unsigned int total[BUCKETS];
/*
for (size_t i=0; i<BUCKETS; i++)
total[i] = 0;
*/
for (size_t i=0; i<BUCKETS; i+=VSIZEX)
vintx::store(&total[i], zero);
for (size_t i=0; i<threadCount; i++)
{
/*
for (size_t j=0; j<BUCKETS; j++)
total[j] += radixCount[i][j];
*/
for (size_t j=0; j<BUCKETS; j+=VSIZEX)
vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
}
/* calculate start offset of each bucket */
__aligned(64) unsigned int offset[BUCKETS];
offset[0] = 0;
for (size_t i=1; i<BUCKETS; i++)
offset[i] = offset[i-1] + total[i-1];
/* calculate start offset of each bucket for this thread */
for (size_t i=0; i<threadIndex; i++)
{
/*
for (size_t j=0; j<BUCKETS; j++)
offset[j] += radixCount[i][j];
*/
for (size_t j=0; j<BUCKETS; j+=VSIZEX)
vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
}
/* copy items into their buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
for (size_t i=startID; i<endID; i++) {
const Ty elt = src[i];
#if defined(__64BIT__)
const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
const size_t index = ((Key)src[i] >> shift) & mask;
#endif
dst[offset[index]++] = elt;
}
}
void tbbRadixIteration(const Key shift, const bool last,
const Ty* __restrict src, Ty* __restrict dst,
const size_t numTasks)
{
affinity_partitioner ap;
parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
}
void tbbRadixSort(const size_t numTasks)
{
radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);
if (sizeof(Key) == sizeof(uint32_t)) {
tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
}
else if (sizeof(Key) == sizeof(uint64_t))
{
tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
}
}
private:
TyRadixCount* radixCount;
Ty* const src;
Ty* const tmp;
const size_t N;
};
template<typename Ty>
void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
{
ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
}
template<typename Ty, typename Key>
void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
{
ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
}
template<typename Ty>
void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
}
template<typename Ty>
void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
}
}
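
Usage sketch (illustrative): radix_sort needs a scratch buffer of the same size as the input, and the *_u32/_u64 helpers select the key width explicitly.

#include "parallel_sort.h"
#include <cstdint>
#include <vector>

void sortKeys(std::vector<uint64_t>& keys)
{
  std::vector<uint64_t> tmp(keys.size()); // scratch buffer, same size as input
  embree::radix_sort_u64(keys.data(), tmp.data(), keys.size());
  // keys is now sorted ascending; tmp contents are unspecified
}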