initial commit, 4.5 stable
Some checks failed
🔗 GHA / 📊 Static checks (push) Has been cancelled
🔗 GHA / 🤖 Android (push) Has been cancelled
🔗 GHA / 🍏 iOS (push) Has been cancelled
🔗 GHA / 🐧 Linux (push) Has been cancelled
🔗 GHA / 🍎 macOS (push) Has been cancelled
🔗 GHA / 🏁 Windows (push) Has been cancelled
🔗 GHA / 🌐 Web (push) Has been cancelled
thirdparty/embree/common/algorithms/parallel_any_of.h (vendored, new file, 56 lines)
@@ -0,0 +1,56 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <functional>
#include "parallel_reduce.h"

namespace embree
{

  template<typename Index, class UnaryPredicate>
  __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred)
  {
    std::atomic_bool ret;
    ret = false;

#if defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
    tbb::task_group_context context;
    tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) {
      if (context.is_group_execution_cancelled()) return;
      for (size_t i = r.begin(); i != r.end(); ++i) {
        if (pred(i)) {
          ret = true;
          context.cancel_group_execution();
        }
      }
    });
#else
    tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) {
      if (tbb::task::self().is_cancelled()) return;
      for (size_t i = r.begin(); i != r.end(); ++i) {
        if (pred(i)) {
          ret = true;
          tbb::task::self().cancel_group_execution();
        }
      }
    });
#endif
#else
    ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool {
      bool localret = false;
      for (auto i=r.begin(); i<r.end(); ++i) {
        localret |= pred(i);
      }
      return localret;
    },
    std::bit_or<bool>()
    );
#endif

    return ret;
  }

} // end namespace
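A minimal usage sketch (helper name and data are illustrative), assuming the embree common headers are on the include path and a tasking backend (TASKING_INTERNAL or TASKING_TBB) is configured as in the rest of this library. The predicate receives an index, not an element, and outstanding tasks are cancelled once any predicate returns true:

#include "parallel_any_of.h"
#include <vector>

bool has_negative(const std::vector<float>& values)
{
  // pred(i) is evaluated over [0, values.size()); early exit on first hit
  return embree::parallel_any_of(size_t(0), values.size(), [&](size_t i) {
    return values[i] < 0.0f;
  });
}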
thirdparty/embree/common/algorithms/parallel_filter.h (vendored, new file, 93 lines)
@@ -0,0 +1,93 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "parallel_for.h"

namespace embree
{
  template<typename Ty, typename Index, typename Predicate>
  inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
  {
    Index j = first;
    for (Index i=first; i<last; i++)
      if (predicate(data[i]))
        data[j++] = data[i];

    return j;
  }

  template<typename Ty, typename Index, typename Predicate>
  inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
  {
    /* sequential fallback */
    if (end-begin <= minStepSize)
      return sequential_filter(data,begin,end,predicate);

    /* calculate number of tasks to use */
    enum { MAX_TASKS = 64 };
    const Index numThreads = TaskScheduler::threadCount();
    const Index numBlocks = (end-begin+minStepSize-1)/minStepSize;
    const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS);

    /* filter blocks */
    Index nused[MAX_TASKS];
    Index nfree[MAX_TASKS];
    parallel_for(taskCount, [&](const Index taskIndex)
    {
      const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
      const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
      const Index i2 = sequential_filter(data,i0,i1,predicate);
      nused[taskIndex] = i2-i0;
      nfree[taskIndex] = i1-i2;
    });

    /* calculate offsets */
    Index sused=0;
    Index sfree=0;
    Index pfree[MAX_TASKS];
    for (Index i=0; i<taskCount; i++)
    {
      sused+=nused[i];
      Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
    }

    /* return if we did not filter out any element */
    assert(sfree <= end-begin);
    assert(sused <= end-begin);
    if (sused == end-begin)
      return end;

    /* otherwise we have to copy misplaced elements around */
    parallel_for(taskCount, [&](const Index taskIndex)
    {
      /* destination to write elements to */
      Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
      Index dst_end = min(dst+nfree[taskIndex],begin+sused);
      if (dst_end <= dst) return;

      /* range of misplaced elements to copy to destination */
      Index r0 = pfree[taskIndex];
      Index r1 = r0+dst_end-dst;

      /* find range in misplaced elements in back to front order */
      Index k0=0;
      for (Index i=taskCount-1; i>0; i--)
      {
        if (k0 > r1) break;
        Index k1 = k0+nused[i];
        Index src = begin+(i+0)*(end-begin)/taskCount+nused[i];
        for (Index i=max(r0,k0); i<min(r1,k1); i++) {
          Index isrc = src-i+k0-1;
          assert(dst >= begin && dst < end);
          assert(isrc >= begin && isrc < end);
          data[dst++] = data[isrc];
        }
        k0 = k1;
      }
    });

    return begin+sused;
  }
}
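A minimal usage sketch (names illustrative), with the same include-path and tasking assumptions as above. parallel_filter compacts the array in place and returns the new logical end; on the parallel path the surviving elements may be reordered across task boundaries:

#include "parallel_filter.h"
#include <vector>

size_t keep_even(std::vector<int>& v)
{
  // elements past the returned index are leftovers and should be ignored
  return embree::parallel_filter(v.data(), size_t(0), v.size(), size_t(1024),
                                 [](const int& x) { return (x & 1) == 0; });
}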
thirdparty/embree/common/algorithms/parallel_for.h (vendored, new file, 161 lines)
@@ -0,0 +1,161 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../tasking/taskscheduler.h"
#include "../sys/array.h"
#include "../math/emath.h"
#include "../math/range.h"

namespace embree
{
  /* parallel_for without range */
  template<typename Index, typename Func>
  __forceinline void parallel_for( const Index N, const Func& func)
  {
#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)
    if (N) {
      TaskScheduler::TaskGroupContext context;
      TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) {
        assert(r.size() == 1);
        func(r.begin());
      },&context);
      TaskScheduler::wait();
      if (context.cancellingException != nullptr) {
        std::rethrow_exception(context.cancellingException);
      }
    }
#elif defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
    tbb::task_group_context context;
    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
      func(i);
    },context);
    if (context.is_group_execution_cancelled())
      throw std::runtime_error("task cancelled");
#else
    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
      func(i);
    });
    if (tbb::task::self().is_cancelled())
      throw std::runtime_error("task cancelled");
#endif

#elif defined(TASKING_PPL)
    concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) {
      func(i);
    });
#else
# error "no tasking system enabled"
#endif
  }

  /* parallel for with range and granularity */
  template<typename Index, typename Func>
  __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func)
  {
    assert(first <= last);
#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)
    TaskScheduler::TaskGroupContext context;
    TaskScheduler::spawn(first,last,minStepSize,func,&context);
    TaskScheduler::wait();
    if (context.cancellingException != nullptr) {
      std::rethrow_exception(context.cancellingException);
    }

#elif defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
    tbb::task_group_context context;
    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
      func(range<Index>(r.begin(),r.end()));
    },context);
    if (context.is_group_execution_cancelled())
      throw std::runtime_error("task cancelled");
#else
    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
      func(range<Index>(r.begin(),r.end()));
    });
    if (tbb::task::self().is_cancelled())
      throw std::runtime_error("task cancelled");
#endif

#elif defined(TASKING_PPL)
    concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) {
      func(range<Index>(i,i+1));
    });

#else
# error "no tasking system enabled"
#endif
  }

  /* parallel for with range */
  template<typename Index, typename Func>
  __forceinline void parallel_for( const Index first, const Index last, const Func& func)
  {
    assert(first <= last);
    parallel_for(first,last,(Index)1,func);
  }

#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001)

  template<typename Index, typename Func>
  __forceinline void parallel_for_static( const Index N, const Func& func)
  {
#if TBB_INTERFACE_VERSION >= 12002
    tbb::task_group_context context;
    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
      func(i);
    },tbb::simple_partitioner(),context);
    if (context.is_group_execution_cancelled())
      throw std::runtime_error("task cancelled");
#else
    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
      func(i);
    },tbb::simple_partitioner());
    if (tbb::task::self().is_cancelled())
      throw std::runtime_error("task cancelled");
#endif
  }

  typedef tbb::affinity_partitioner affinity_partitioner;

  template<typename Index, typename Func>
  __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap)
  {
#if TBB_INTERFACE_VERSION >= 12002
    tbb::task_group_context context;
    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
      func(i);
    },ap,context);
    if (context.is_group_execution_cancelled())
      throw std::runtime_error("task cancelled");
#else
    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
      func(i);
    },ap);
    if (tbb::task::self().is_cancelled())
      throw std::runtime_error("task cancelled");
#endif
  }

#else

  template<typename Index, typename Func>
  __forceinline void parallel_for_static( const Index N, const Func& func)
  {
    parallel_for(N,func);
  }

  struct affinity_partitioner {
  };

  template<typename Index, typename Func>
  __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap)
  {
    parallel_for(N,func);
  }

#endif
}
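A minimal usage sketch (names illustrative) of the range form, which hands each task a contiguous block of at least minStepSize iterations instead of one index at a time:

#include "parallel_for.h"
#include <vector>

void scale(std::vector<float>& v, float s)
{
  embree::parallel_for(size_t(0), v.size(), size_t(4096),
    [&](const embree::range<size_t>& r) {
      // each task owns a disjoint block, so no synchronization is needed
      for (size_t i = r.begin(); i < r.end(); i++)
        v[i] *= s;
    });
}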
thirdparty/embree/common/algorithms/parallel_for_for.h (vendored, new file, 160 lines)
@@ -0,0 +1,160 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "parallel_for.h"

namespace embree
{
  template<typename ArrayArray, typename Func>
  __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
  {
    size_t k=0;
    for (size_t i=0; i!=array2.size(); ++i) {
      const size_t N = array2[i]->size();
      if (N) func(array2[i],range<size_t>(0,N),k);
      k+=N;
    }
  }

  class ParallelForForState
  {
  public:

    enum { MAX_TASKS = 64 };

    __forceinline ParallelForForState ()
      : taskCount(0) {}

    template<typename ArrayArray>
    __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) {
      init(array2,minStepSize);
    }

    template<typename SizeFunc>
    __forceinline ParallelForForState (const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize) {
      init(numArrays,getSize,minStepSize);
    }

    template<typename SizeFunc>
    __forceinline void init ( const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize )
    {
      /* first calculate total number of elements */
      size_t N = 0;
      for (size_t i=0; i<numArrays; i++) {
        N += getSize(i);
      }
      this->N = N;

      /* calculate number of tasks to use */
      const size_t numThreads = TaskScheduler::threadCount();
      const size_t numBlocks = (N+minStepSize-1)/minStepSize;
      taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS)));

      /* calculate start (i,j) for each task */
      size_t taskIndex = 0;
      i0[taskIndex] = 0;
      j0[taskIndex] = 0;
      size_t k0 = (++taskIndex)*N/taskCount;
      for (size_t i=0, k=0; taskIndex < taskCount; i++)
      {
        assert(i<numArrays);
        size_t j=0, M = getSize(i);
        while (j<M && k+M-j >= k0 && taskIndex < taskCount) {
          assert(taskIndex<taskCount);
          i0[taskIndex] = i;
          j0[taskIndex] = j += k0-k;
          k=k0;
          k0 = (++taskIndex)*N/taskCount;
        }
        k+=M-j;
      }
    }

    template<typename ArrayArray>
    __forceinline void init ( ArrayArray& array2, const size_t minStepSize )
    {
      init(array2.size(),[&](size_t i) { return array2[i] ? array2[i]->size() : 0; },minStepSize);
    }

    __forceinline size_t size() const {
      return N;
    }

  public:
    size_t i0[MAX_TASKS];
    size_t j0[MAX_TASKS];
    size_t taskCount;
    size_t N;
  };

  template<typename ArrayArray, typename Func>
  __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
  {
    ParallelForForState state(array2,minStepSize);

    parallel_for(state.taskCount, [&](const size_t taskIndex)
    {
      /* calculate range */
      const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
      const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
      size_t i0 = state.i0[taskIndex];
      size_t j0 = state.j0[taskIndex];

      /* iterate over arrays */
      size_t k=k0;
      for (size_t i=i0; k<k1; i++) {
        const size_t N = array2[i] ? array2[i]->size() : 0;
        const size_t r0 = j0, r1 = min(N,r0+k1-k);
        if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k);
        k+=r1-r0; j0 = 0;
      }
    });
  }

  template<typename ArrayArray, typename Func>
  __forceinline void parallel_for_for( ArrayArray& array2, const Func& func )
  {
    parallel_for_for(array2,1,func);
  }

  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
  {
    ParallelForForState state(array2,minStepSize);
    Value temp[ParallelForForState::MAX_TASKS];

    for (size_t i=0; i<state.taskCount; i++)
      temp[i] = identity;

    parallel_for(state.taskCount, [&](const size_t taskIndex)
    {
      /* calculate range */
      const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
      const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
      size_t i0 = state.i0[taskIndex];
      size_t j0 = state.j0[taskIndex];

      /* iterate over arrays */
      size_t k=k0;
      for (size_t i=i0; k<k1; i++) {
        const size_t N = array2[i] ? array2[i]->size() : 0;
        const size_t r0 = j0, r1 = min(N,r0+k1-k);
        if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k));
        k+=r1-r0; j0 = 0;
      }
    });

    Value ret = identity;
    for (size_t i=0; i<state.taskCount; i++)
      ret = reduction(ret,temp[i]);
    return ret;
  }

  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction)
  {
    return parallel_for_for_reduce(array2,1,identity,func,reduction);
  }
}
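A minimal usage sketch (names illustrative): parallel_for_for flattens an array of array pointers into one load-balanced index space. func receives the inner array, the sub-range to process within it, and k, the flat offset of that sub-range's first element across all arrays:

#include "parallel_for_for.h"
#include <vector>

void number_elements(std::vector<std::vector<int>*>& arrays)
{
  embree::parallel_for_for(arrays, size_t(1024),
    [&](std::vector<int>* a, const embree::range<size_t>& r, size_t k) {
      // k tracks the global flat index of element j across all arrays
      for (size_t j = r.begin(); j < r.end(); j++, k++)
        (*a)[j] = int(k);
    });
}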
thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h (vendored, new file, 142 lines)
@@ -0,0 +1,142 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "parallel_for_for.h"
#include "parallel_prefix_sum.h"

namespace embree
{
  template<typename Value>
  struct ParallelForForPrefixSumState : public ParallelForForState
  {
    __forceinline ParallelForForPrefixSumState () {}

    template<typename ArrayArray>
    __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize)
      : ParallelForForState(array2,minStepSize) {}

    template<typename SizeFunc>
    __forceinline ParallelForForPrefixSumState (size_t numArrays, const SizeFunc& getSize, const size_t minStepSize)
      : ParallelForForState(numArrays,getSize,minStepSize) {}

    ParallelPrefixSumState<Value> prefix_state;
  };

  template<typename SizeFunc, typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_for_for_prefix_sum0_( ParallelForForPrefixSumState<Value>& state, Index minStepSize,
                                                     const SizeFunc& getSize, const Value& identity, const Func& func, const Reduction& reduction)
  {
    /* calculate number of tasks to use */
    const size_t taskCount = state.taskCount;

    /* perform parallel prefix sum */
    parallel_for(taskCount, [&](const size_t taskIndex)
    {
      const size_t k0 = (taskIndex+0)*state.size()/taskCount;
      const size_t k1 = (taskIndex+1)*state.size()/taskCount;
      size_t i0 = state.i0[taskIndex];
      size_t j0 = state.j0[taskIndex];

      /* iterate over arrays */
      size_t k=k0;
      Value N=identity;
      for (size_t i=i0; k<k1; i++) {
        const size_t size = getSize(i);
        const size_t r0 = j0, r1 = min(size,r0+k1-k);
        if (r1 > r0) N = reduction(N, func((Index)i,range<Index>((Index)r0,(Index)r1),(Index)k));
        k+=r1-r0; j0 = 0;
      }
      state.prefix_state.counts[taskIndex] = N;
    });

    /* calculate prefix sum */
    Value sum=identity;
    for (size_t i=0; i<taskCount; i++)
    {
      const Value c = state.prefix_state.counts[i];
      state.prefix_state.sums[i] = sum;
      sum=reduction(sum,c);
    }

    return sum;
  }

  template<typename SizeFunc, typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_for_for_prefix_sum1_( ParallelForForPrefixSumState<Value>& state, Index minStepSize,
                                                     const SizeFunc& getSize,
                                                     const Value& identity, const Func& func, const Reduction& reduction)
  {
    /* calculate number of tasks to use */
    const size_t taskCount = state.taskCount;
    /* perform parallel prefix sum */
    parallel_for(taskCount, [&](const size_t taskIndex)
    {
      const size_t k0 = (taskIndex+0)*state.size()/taskCount;
      const size_t k1 = (taskIndex+1)*state.size()/taskCount;
      size_t i0 = state.i0[taskIndex];
      size_t j0 = state.j0[taskIndex];

      /* iterate over arrays */
      size_t k=k0;
      Value N=identity;
      for (size_t i=i0; k<k1; i++) {
        const size_t size = getSize(i);
        const size_t r0 = j0, r1 = min(size,r0+k1-k);
        if (r1 > r0) N = reduction(N, func((Index)i,range<Index>((Index)r0,(Index)r1),(Index)k,reduction(state.prefix_state.sums[taskIndex],N)));
        k+=r1-r0; j0 = 0;
      }
      state.prefix_state.counts[taskIndex] = N;
    });

    /* calculate prefix sum */
    Value sum=identity;
    for (size_t i=0; i<taskCount; i++)
    {
      const Value c = state.prefix_state.counts[i];
      state.prefix_state.sums[i] = sum;
      sum=reduction(sum,c);
    }

    return sum;
  }

  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state,
                                                    ArrayArray& array2, Index minStepSize,
                                                    const Value& identity, const Func& func, const Reduction& reduction)
  {
    return parallel_for_for_prefix_sum0_(state,minStepSize,
                                         [&](Index i) { return array2[i] ? array2[i]->size() : 0; },
                                         identity,
                                         [&](Index i, const range<Index>& r, Index k) { return func(array2[i], r, k, i); },
                                         reduction);
  }

  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state,
                                                    ArrayArray& array2, Index minStepSize,
                                                    const Value& identity, const Func& func, const Reduction& reduction)
  {
    return parallel_for_for_prefix_sum1_(state,minStepSize,
                                         [&](Index i) { return array2[i] ? array2[i]->size() : 0; },
                                         identity,
                                         [&](Index i, const range<Index>& r, Index k, const Value& base) { return func(array2[i], r, k, i, base); },
                                         reduction);
  }

  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
                                                    const Value& identity, const Func& func, const Reduction& reduction)
  {
    return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction);
  }

  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
                                                    const Value& identity, const Func& func, const Reduction& reduction)
  {
    return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction);
  }
}
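A minimal sketch (names illustrative) of the intended two-pass pattern: pass 0 only counts per sub-range; pass 1 re-runs over the same state and additionally receives base, the reduction of everything before the current sub-range, so each task knows where to write without synchronization:

#include "parallel_for_for_prefix_sum.h"
#include <functional>
#include <vector>

size_t compact_positives(std::vector<std::vector<int>*>& arrays, std::vector<int>& out)
{
  embree::ParallelForForPrefixSumState<size_t> state(arrays, size_t(1024));

  /* pass 0: count matching elements per sub-range */
  const size_t total = embree::parallel_for_for_prefix_sum0(state, arrays, size_t(0),
    [&](std::vector<int>* a, const embree::range<size_t>& r, size_t k, size_t i) -> size_t {
      size_t n = 0;
      for (size_t j = r.begin(); j < r.end(); j++) n += ((*a)[j] > 0);
      return n;
    }, std::plus<size_t>());

  out.resize(total);

  /* pass 1: base is the number of matches before this sub-range */
  embree::parallel_for_for_prefix_sum1(state, arrays, size_t(0),
    [&](std::vector<int>* a, const embree::range<size_t>& r, size_t k, size_t i, const size_t& base) -> size_t {
      size_t n = 0;
      for (size_t j = r.begin(); j < r.end(); j++)
        if ((*a)[j] > 0) out[base + n++] = (*a)[j];
      return n;
    }, std::plus<size_t>());

  return total;
}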
thirdparty/embree/common/algorithms/parallel_map.h (vendored, new file, 85 lines)
@@ -0,0 +1,85 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "parallel_sort.h"

namespace embree
{
  /*! implementation of a key/value map with parallel construction */
  template<typename Key, typename Val>
  class parallel_map
  {
    /* key/value pair to build the map */
    struct KeyValue
    {
      __forceinline KeyValue () {}

      __forceinline KeyValue (const Key key, const Val val)
        : key(key), val(val) {}

      __forceinline operator Key() const {
        return key;
      }

    public:
      Key key;
      Val val;
    };

  public:

    /*! parallel map constructors */
    parallel_map () {}

    /*! construction from pair of vectors */
    template<typename KeyVector, typename ValVector>
    parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); }

    /*! initializes the parallel map from a vector with keys and values */
    template<typename KeyVector, typename ValVector>
    void init(const KeyVector& keys, const ValVector& values)
    {
      /* reserve sufficient space for all data */
      assert(keys.size() == values.size());
      vec.resize(keys.size());

      /* generate key/value pairs */
      parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) {
        for (size_t i=r.begin(); i<r.end(); i++)
          vec[i] = KeyValue((Key)keys[i],values[i]);
      });

      /* perform parallel radix sort of the key/value pairs */
      std::vector<KeyValue> temp(keys.size());
      radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size());
    }

    /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr if the key is not contained in the map. */
    __forceinline const Val* lookup(const Key& key) const
    {
      typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
      if (i == vec.end()) return nullptr;
      if (i->key != key) return nullptr;
      return &i->val;
    }

    /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */
    __forceinline Val lookup(const Key& key, const Val& def) const
    {
      typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
      if (i == vec.end()) return def;
      if (i->key != key) return def;
      return i->val;
    }

    /*! clears all state */
    void clear() {
      vec.clear();
    }

  private:
    std::vector<KeyValue> vec; //!< vector containing sorted elements
  };
}
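A minimal usage sketch (names and data illustrative): the map is built once from parallel key/value vectors (pair generation in parallel, then a radix sort over the keys) and then queried via binary search:

#include "parallel_map.h"
#include <cstdint>
#include <vector>

void example()
{
  std::vector<uint32_t> keys = { 7, 3, 11 };
  std::vector<float>    vals = { 0.7f, 0.3f, 1.1f };

  embree::parallel_map<uint32_t,float> map(keys, vals);

  const float* p = map.lookup(uint32_t(3));   // pointer form: nullptr if absent
  float v = map.lookup(uint32_t(5), 0.0f);    // value form: default if absent
  (void)p; (void)v;
}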
thirdparty/embree/common/algorithms/parallel_partition.h (vendored, new file, 283 lines)
@@ -0,0 +1,283 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "parallel_for.h"
#include "../math/range.h"

namespace embree
{
  /* serial partitioning */
  template<typename T, typename V, typename IsLeft, typename Reduction_T>
  __forceinline size_t serial_partitioning(T* array,
                                           const size_t begin,
                                           const size_t end,
                                           V& leftReduction,
                                           V& rightReduction,
                                           const IsLeft& is_left,
                                           const Reduction_T& reduction_t)
  {
    T* l = array + begin;
    T* r = array + end - 1;

    while(1)
    {
      /* *l < pivot */
      while (likely(l <= r && is_left(*l) ))
      {
        //prefetchw(l+4); // FIXME: enable?
        reduction_t(leftReduction,*l);
        ++l;
      }
      /* *r >= pivot */
      while (likely(l <= r && !is_left(*r)))
      {
        //prefetchw(r-4); // FIXME: enable?
        reduction_t(rightReduction,*r);
        --r;
      }
      if (r<l) break;

      reduction_t(leftReduction ,*r);
      reduction_t(rightReduction,*l);
      xchg(*l,*r);
      l++; r--;
    }

    return l - array;
  }

  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
  class __aligned(64) parallel_partition_task
  {
    ALIGNED_CLASS_(64);
  private:

    static const size_t MAX_TASKS = 64;

    T* array;
    size_t N;
    const IsLeft& is_left;
    const Reduction_T& reduction_t;
    const Reduction_V& reduction_v;
    const Vi& identity;

    size_t numTasks;
    __aligned(64) size_t counter_start[MAX_TASKS+1];
    __aligned(64) size_t counter_left[MAX_TASKS+1];
    __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];
    __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS];
    __aligned(64) V leftReductions[MAX_TASKS];
    __aligned(64) V rightReductions[MAX_TASKS];

  public:

    __forceinline parallel_partition_task(T* array,
                                          const size_t N,
                                          const Vi& identity,
                                          const IsLeft& is_left,
                                          const Reduction_T& reduction_t,
                                          const Reduction_V& reduction_v,
                                          const size_t BLOCK_SIZE)

      : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
        numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}

    __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
    {
      size_t i = 0;
      while(index >= (size_t)r[i].size())
      {
        assert(i < numRanges);
        index -= (size_t)r[i].size();
        i++;
      }
      return &r[i];
    }

    __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
                                                  const size_t numRightMisplacedRanges,
                                                  const size_t startID,
                                                  const size_t endID)
    {
      size_t leftLocalIndex = startID;
      size_t rightLocalIndex = startID;
      const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
      const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);

      size_t l_left = l_range->size() - leftLocalIndex;
      size_t r_left = r_range->size() - rightLocalIndex;
      T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
      T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
      size_t size = endID - startID;
      size_t items = min(size,min(l_left,r_left));

      while (size)
      {
        if (unlikely(l_left == 0))
        {
          l_range++;
          l_left = l_range->size();
          l = &array[l_range->begin()];
          items = min(size,min(l_left,r_left));
        }

        if (unlikely(r_left == 0))
        {
          r_range++;
          r_left = r_range->size();
          r = &array[r_range->begin()];
          items = min(size,min(l_left,r_left));
        }

        size -= items;
        l_left -= items;
        r_left -= items;

        while(items) {
          items--;
          xchg(*l++,*r++);
        }
      }
    }

    __forceinline size_t partition(V& leftReduction, V& rightReduction)
    {
      /* partition the individual ranges for each task */
      parallel_for(numTasks,[&] (const size_t taskID) {
        const size_t startID = (taskID+0)*N/numTasks;
        const size_t endID = (taskID+1)*N/numTasks;
        V local_left(identity);
        V local_right(identity);
        const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
        counter_start[taskID] = startID;
        counter_left [taskID] = mid-startID;
        leftReductions[taskID] = local_left;
        rightReductions[taskID] = local_right;
      });
      counter_start[numTasks] = N;
      counter_left[numTasks] = 0;

      /* finalize the reductions */
      for (size_t i=0; i<numTasks; i++) {
        reduction_v(leftReduction,leftReductions[i]);
        reduction_v(rightReduction,rightReductions[i]);
      }

      /* calculate mid point for partitioning */
      size_t mid = counter_left[0];
      for (size_t i=1; i<numTasks; i++)
        mid += counter_left[i];
      const range<ssize_t> globalLeft (0,mid);
      const range<ssize_t> globalRight(mid,N);

      /* calculate all left and right ranges that are on the wrong global side */
      size_t numMisplacedRangesLeft = 0;
      size_t numMisplacedRangesRight = 0;
      size_t numMisplacedItemsLeft MAYBE_UNUSED = 0;
      size_t numMisplacedItemsRight MAYBE_UNUSED = 0;

      for (size_t i=0; i<numTasks; i++)
      {
        const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
        const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
        const range<ssize_t> left_misplaced = globalLeft. intersect(right_range);
        const range<ssize_t> right_misplaced = globalRight.intersect(left_range);

        if (!left_misplaced.empty())
        {
          numMisplacedItemsLeft += left_misplaced.size();
          leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
        }

        if (!right_misplaced.empty())
        {
          numMisplacedItemsRight += right_misplaced.size();
          rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
        }
      }
      assert( numMisplacedItemsLeft == numMisplacedItemsRight );

      /* if no items are misplaced we are done */
      if (numMisplacedItemsLeft == 0)
        return mid;

      /* otherwise we copy the items to the right place in parallel */
      parallel_for(numTasks,[&] (const size_t taskID) {
        const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
        const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks;
        swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);
      });

      return mid;
    }
  };

  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
  __noinline size_t parallel_partitioning(T* array,
                                          const size_t begin,
                                          const size_t end,
                                          const Vi &identity,
                                          V &leftReduction,
                                          V &rightReduction,
                                          const IsLeft& is_left,
                                          const Reduction_T& reduction_t,
                                          const Reduction_V& reduction_v,
                                          size_t BLOCK_SIZE = 128)
  {
    /* fall back to single threaded partitioning for small N */
    if (unlikely(end-begin < BLOCK_SIZE))
      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);

    /* otherwise use parallel code */
    else {
      typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
      std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
      return begin+p->partition(leftReduction,rightReduction);
    }
  }

  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
  __noinline size_t parallel_partitioning(T* array,
                                          const size_t begin,
                                          const size_t end,
                                          const Vi &identity,
                                          V &leftReduction,
                                          V &rightReduction,
                                          const IsLeft& is_left,
                                          const Reduction_T& reduction_t,
                                          const Reduction_V& reduction_v,
                                          size_t BLOCK_SIZE,
                                          size_t PARALLEL_THRESHOLD)
  {
    /* fall back to single threaded partitioning for small N */
    if (unlikely(end-begin < PARALLEL_THRESHOLD))
      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);

    /* otherwise use parallel code */
    else {
      typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
      std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
      return begin+p->partition(leftReduction,rightReduction);
    }
  }


  template<typename T, typename IsLeft>
  inline size_t parallel_partitioning(T* array,
                                      const size_t begin,
                                      const size_t end,
                                      const IsLeft& is_left,
                                      size_t BLOCK_SIZE = 128)
  {
    size_t leftReduction = 0;
    size_t rightReduction = 0;
    return parallel_partitioning(
      array,begin,end,0,leftReduction,rightReduction,is_left,
      [] (size_t& t,const T& ref) { },
      [] (size_t& t0,size_t& t1) { },
      BLOCK_SIZE);
  }

}
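A minimal usage sketch (names illustrative) of the reduction-free overload: elements satisfying is_left end up before the returned split index, the rest after it:

#include "parallel_partition.h"
#include <vector>

size_t split_below(std::vector<float>& v, float pivot)
{
  // returns the index of the first element on the right side
  return embree::parallel_partitioning(v.data(), size_t(0), v.size(),
    [pivot](const float& x) { return x < pivot; });
}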
thirdparty/embree/common/algorithms/parallel_prefix_sum.h (vendored, new file, 85 lines)
@@ -0,0 +1,85 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "parallel_for.h"

namespace embree
{
  template<typename Value>
  struct ParallelPrefixSumState
  {
    enum { MAX_TASKS = 64 };
    Value counts[MAX_TASKS];
    Value sums  [MAX_TASKS];
  };

  template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
  {
    /* calculate number of tasks to use */
    const size_t numThreads = TaskScheduler::threadCount();
    const size_t numBlocks = (last-first+minStepSize-1)/minStepSize;
    const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));

    /* perform parallel prefix sum */
    parallel_for(taskCount, [&](const size_t taskIndex)
    {
      const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
      const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
      state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
    });

    /* calculate prefix sum */
    Value sum=identity;
    for (size_t i=0; i<taskCount; i++)
    {
      const Value c = state.counts[i];
      state.sums[i] = sum;
      sum=reduction(sum,c);
    }

    return sum;
  }

  /*! parallel calculation of prefix sums */
  template<typename SrcArray, typename DstArray, typename Value, typename Add>
  __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096)
  {
    /* perform single threaded prefix operation for small N */
    if (N < SINGLE_THREAD_THRESHOLD)
    {
      Value sum=identity;
      for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
      return sum;
    }

    /* perform parallel prefix operation for large N */
    else
    {
      ParallelPrefixSumState<Value> state;

      /* initial run just sets up start values for subtasks */
      parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {

        Value s = identity;
        for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
        return s;

      }, add);

      /* final run calculates prefix sum */
      return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {

        Value s = identity;
        for (size_t i=r.begin(); i<r.end(); i++) {
          dst[i] = add(sum,s);
          s = add(s,src[i]);
        }
        return s;

      }, add);
    }
  }
}
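A minimal usage sketch (names illustrative) of the array overload, which writes the exclusive prefix sum of src into dst and returns the total. Internally the first pass computes per-block sums and the second pass finalizes each block with its starting offset:

#include "parallel_prefix_sum.h"
#include <vector>

size_t exclusive_scan(const std::vector<size_t>& src, std::vector<size_t>& dst)
{
  dst.resize(src.size());
  // dst[i] receives the sum of src[0..i), the return value is the grand total
  return embree::parallel_prefix_sum(src, dst, src.size(), size_t(0),
    [](size_t a, size_t b) { return a + b; });
}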
thirdparty/embree/common/algorithms/parallel_reduce.h (vendored, new file, 146 lines)
@@ -0,0 +1,146 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "parallel_for.h"

namespace embree
{
  template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
  {
    return func(range<Index>(first,last));
  }

  template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
  {
    return func(range<Index>(first,last));
  }

  template<typename Index, typename Value, typename Func, typename Reduction>
  __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
  {
    const Index maxTasks = 512;
    const Index threadCount = (Index) TaskScheduler::threadCount();
    taskCount = min(taskCount,threadCount,maxTasks);

    /* parallel invocation of all tasks */
    dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
    parallel_for(taskCount, [&](const Index taskIndex) {
      const Index k0 = first+(taskIndex+0)*(last-first)/taskCount;
      const Index k1 = first+(taskIndex+1)*(last-first)/taskCount;
      values[taskIndex] = func(range<Index>(k0,k1));
    });

    /* perform reduction over all tasks */
    Value v = identity;
    for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]);
    return v;
  }

  template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
  {
#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)

    /* fast path for small number of iterations */
    Index taskCount = (last-first+minStepSize-1)/minStepSize;
    if (likely(taskCount == 1)) {
      return func(range<Index>(first,last));
    }
    return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction);

#elif defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
    tbb::task_group_context context;
    const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
      [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
      reduction,context);
    //if (context.is_group_execution_cancelled())
    //  throw std::runtime_error("task cancelled");
    return v;
#else
    const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
      [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
      reduction);
    //if (tbb::task::self().is_cancelled())
    //  throw std::runtime_error("task cancelled");
    return v;
#endif
#else // TASKING_PPL
    struct AlignedValue
    {
      char storage[__alignof(Value)+sizeof(Value)];
      static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); };
      Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
      const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
      AlignedValue(const Value& v) { new(getValuePtr()) Value(v); }
      AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); }
      AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); };
      AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
      AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
      operator Value() const { return *getValuePtr(); }
    };

    struct Iterator_Index
    {
      Index v;
      typedef std::forward_iterator_tag iterator_category;
      typedef AlignedValue value_type;
      typedef Index difference_type;
      typedef Index distance_type;
      typedef AlignedValue* pointer;
      typedef AlignedValue& reference;
      __forceinline Iterator_Index() {}
      __forceinline Iterator_Index(Index v) : v(v) {}
      __forceinline bool operator== (Iterator_Index other) { return v == other.v; }
      __forceinline bool operator!= (Iterator_Index other) { return v != other.v; }
      __forceinline Iterator_Index operator++() { return Iterator_Index(++v); }
      __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); }
    };

    auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) {
      assert(begin.v < end.v);
      return reduction(start, func(range<Index>(begin.v, end.v)));
    };
    const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction);
    return v;
#endif
  }

  template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
  {
    if (likely(last-first < parallel_threshold)) {
      return func(range<Index>(first,last));
    } else {
      return parallel_reduce(first,last,minStepSize,identity,func,reduction);
    }
  }

  template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
  {
    return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction);
  }

  template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
  {
    auto funcr = [&] ( const range<Index> r ) {
      Value v = identity;
      for (Index i=r.begin(); i<r.end(); i++)
        v = reduction(v,func(i));
      return v;
    };
    return parallel_reduce(first,last,Index(1),identity,funcr,reduction);
  }

  template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction )
  {
    return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction);
  }
}
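A minimal usage sketch (names illustrative): each task reduces one contiguous block with func, and the per-task partial values are then combined with reduction:

#include "parallel_reduce.h"
#include <vector>

float sum(const std::vector<float>& v)
{
  return embree::parallel_reduce(size_t(0), v.size(), size_t(1024), 0.0f,
    [&](const embree::range<size_t>& r) -> float {
      float s = 0.0f;
      for (size_t i = r.begin(); i < r.end(); i++) s += v[i];
      return s;  // partial sum for this block
    },
    [](float a, float b) { return a + b; });
}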
thirdparty/embree/common/algorithms/parallel_set.h (vendored, new file, 52 lines)
@@ -0,0 +1,52 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "parallel_sort.h"

namespace embree
{
  /* implementation of a set of values with parallel construction */
  template<typename T>
  class parallel_set
  {
  public:

    /*! default constructor for the parallel set */
    parallel_set () {}

    /*! construction from vector */
    template<typename Vector>
    parallel_set (const Vector& in) { init(in); }

    /*! initializes the parallel set from a vector */
    template<typename Vector>
    void init(const Vector& in)
    {
      /* copy data to internal vector */
      vec.resize(in.size());
      parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) {
        for (size_t i=r.begin(); i<r.end(); i++)
          vec[i] = in[i];
      });

      /* sort the data */
      std::vector<T> temp(in.size());
      radix_sort<T>(vec.data(),temp.data(),vec.size());
    }

    /*! tests if some element is in the set */
    __forceinline bool lookup(const T& elt) const {
      return std::binary_search(vec.begin(), vec.end(), elt);
    }

    /*! clears all state */
    void clear() {
      vec.clear();
    }

  private:
    std::vector<T> vec; //!< vector containing sorted elements
  };
}
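A minimal usage sketch (names illustrative): the set is built once (parallel copy plus radix sort) and then queried with binary search:

#include "parallel_set.h"
#include <cstdint>
#include <vector>

bool contains(const std::vector<uint32_t>& ids, uint32_t id)
{
  embree::parallel_set<uint32_t> set(ids);  // in real code, build once and reuse
  return set.lookup(id);
}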
thirdparty/embree/common/algorithms/parallel_sort.h (vendored, new file, 454 lines)
@@ -0,0 +1,454 @@
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../simd/simd.h"
#include "parallel_for.h"
#include <algorithm>

namespace embree
{
  template<class T>
  __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
  {
    for(size_t i = 1;i<length;++i)
    {
      T v = array[i];
      size_t j = i;
      while(j > 0 && v < array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }

  template<class T>
  __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
  {
    for(size_t i = 1;i<length;++i)
    {
      T v = array[i];
      size_t j = i;
      while(j > 0 && v > array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }

  template<class T>
  void quicksort_ascending(T *__restrict__ t,
                           const ssize_t begin,
                           const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left = begin - 1;
      ssize_t right = end + 1;

      while(1)
      {
        while (t[--right] > pivotvalue);
        while (t[++left] < pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left] = temp;
      }

      const int pivot = right;
      quicksort_ascending(t, begin, pivot);
      quicksort_ascending(t, pivot + 1, end);
    }
  }

  template<class T>
  void quicksort_decending(T *__restrict__ t,
                           const ssize_t begin,
                           const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left = begin - 1;
      ssize_t right = end + 1;

      while(1)
      {
        while (t[--right] < pivotvalue);
        while (t[++left] > pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left] = temp;
      }

      const int pivot = right;
      quicksort_decending(t, begin, pivot);
      quicksort_decending(t, pivot + 1, end);
    }
  }


  template<class T, ssize_t THRESHOLD>
  void quicksort_insertionsort_ascending(T *__restrict__ t,
                                         const ssize_t begin,
                                         const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_ascending<T>(&t[begin],size);
      }
      else
      {
        const T pivotvalue = t[begin];
        ssize_t left = begin - 1;
        ssize_t right = end + 1;

        while(1)
        {
          while (t[--right] > pivotvalue);
          while (t[++left] < pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left] = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }


  template<class T, ssize_t THRESHOLD>
  void quicksort_insertionsort_decending(T *__restrict__ t,
                                         const ssize_t begin,
                                         const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_decending<T>(&t[begin],size);
      }
      else
      {

        const T pivotvalue = t[begin];
        ssize_t left = begin - 1;
        ssize_t right = end + 1;

        while(1)
        {
          while (t[--right] < pivotvalue);
          while (t[++left] > pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left] = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }

  template<typename T>
  static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
  {
    static const unsigned int BITS = 8;
    static const unsigned int BUCKETS = (1 << BITS);
    static const unsigned int CMP_SORT_THRESHOLD = 16;

    __aligned(64) unsigned int count[BUCKETS];

    /* clear buckets */
    for (size_t i=0;i<BUCKETS;i++) count[i] = 0;

    /* count buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
    for (size_t i=0;i<num;i++)
      count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;

    /* prefix sums */
    __aligned(64) unsigned int head[BUCKETS];
    __aligned(64) unsigned int tail[BUCKETS];

    head[0] = 0;
    for (size_t i=1; i<BUCKETS; i++)
      head[i] = head[i-1] + count[i-1];

    for (size_t i=0; i<BUCKETS-1; i++)
      tail[i] = head[i+1];

    tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];

    assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);
    assert(tail[BUCKETS-1] == num);

    /* in-place swap */
    for (size_t i=0;i<BUCKETS;i++)
    {
      /* process bucket */
      while(head[i] < tail[i])
      {
        T v = morton[head[i]];
        while(1)
        {
          const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
          if (b == i) break;
          std::swap(v,morton[head[b]++]);
        }
        assert((unsigned(v) >> shift & (BUCKETS-1)) == i);
        morton[head[i]++] = v;
      }
    }
    if (shift == 0) return;

    size_t offset = 0;
    for (size_t i=0;i<BUCKETS;i++)
      if (count[i])
      {

        for (size_t j=offset;j<offset+count[i]-1;j++)
          assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);

        if (unlikely(count[i] < CMP_SORT_THRESHOLD))
          insertionsort_ascending(morton + offset, count[i]);
        else
          radixsort32(morton + offset, count[i], shift-BITS);

        for (size_t j=offset;j<offset+count[i]-1;j++)
          assert(morton[j] <= morton[j+1]);

        offset += count[i];
      }
  }

  template<typename Ty, typename Key>
  class ParallelRadixSort
  {
    static const size_t MAX_TASKS = 64;
    static const size_t BITS = 8;
    static const size_t BUCKETS = (1 << BITS);
    typedef unsigned int TyRadixCount[BUCKETS];

    template<typename T>
    static bool compare(const T& v0, const T& v1) {
      return (Key)v0 < (Key)v1;
    }

  private:
    ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
    ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement


  public:
    ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
      : radixCount(nullptr), src(src), tmp(tmp), N(N) {}

    void sort(const size_t blockSize)
    {
      assert(blockSize > 0);

      /* perform single threaded sort for small N */
      if (N<=blockSize) // handles also special case of 0!
      {
        /* do inplace sort inside destination array */
        std::sort(src,src+N,compare<Ty>);
      }

      /* perform parallel sort for large N */
      else
      {
        const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
        tbbRadixSort(numThreads);
      }
    }

    ~ParallelRadixSort()
    {
      alignedFree(radixCount);
      radixCount = nullptr;
    }

  private:

    void tbbRadixIteration0(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID = (threadIndex+1)*N/threadCount;

      /* mask to extract some number of bits */
      const Key mask = BUCKETS-1;

      /* count how many items go into the buckets */
      for (size_t i=0; i<BUCKETS; i++)
        radixCount[threadIndex][i] = 0;

      /* iterate over src array and count buckets */
      unsigned int * __restrict const count = radixCount[threadIndex];
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
#if defined(__64BIT__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const Key index = ((Key)src[i] >> shift) & mask;
#endif
        count[index]++;
      }
    }

    void tbbRadixIteration1(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID = (threadIndex+1)*N/threadCount;

      /* mask to extract some number of bits */
      const Key mask = BUCKETS-1;

      /* calculate total number of items for each bucket */
      __aligned(64) unsigned int total[BUCKETS];
      /*
      for (size_t i=0; i<BUCKETS; i++)
        total[i] = 0;
      */
      for (size_t i=0; i<BUCKETS; i+=VSIZEX)
        vintx::store(&total[i], zero);

      for (size_t i=0; i<threadCount; i++)
      {
        /*
        for (size_t j=0; j<BUCKETS; j++)
          total[j] += radixCount[i][j];
        */
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
      }

      /* calculate start offset of each bucket */
      __aligned(64) unsigned int offset[BUCKETS];
      offset[0] = 0;
      for (size_t i=1; i<BUCKETS; i++)
        offset[i] = offset[i-1] + total[i-1];

      /* calculate start offset of each bucket for this thread */
      for (size_t i=0; i<threadIndex; i++)
      {
        /*
        for (size_t j=0; j<BUCKETS; j++)
          offset[j] += radixCount[i][j];
        */
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
      }

      /* copy items into their buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
        const Ty elt = src[i];
#if defined(__64BIT__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const size_t index = ((Key)src[i] >> shift) & mask;
#endif
        dst[offset[index]++] = elt;
      }
    }

    void tbbRadixIteration(const Key shift, const bool last,
                           const Ty* __restrict src, Ty* __restrict dst,
                           const size_t numTasks)
    {
      affinity_partitioner ap;
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
    }

    void tbbRadixSort(const size_t numTasks)
    {
      radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);

      if (sizeof(Key) == sizeof(uint32_t)) {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
      }
      else if (sizeof(Key) == sizeof(uint64_t))
      {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
      }
    }

  private:
    TyRadixCount* radixCount;
    Ty* const src;
    Ty* const tmp;
    const size_t N;
  };

  template<typename Ty>
  void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty, typename Key>
  void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty>
  void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
  }

  template<typename Ty>
  void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
  }
}
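A minimal usage sketch (names illustrative): radix_sort_u32 sorts 32-bit keys in four byte-wide passes that ping-pong between the input and a caller-provided scratch buffer of the same size; after the final pass the sorted data is back in the input array:

#include "parallel_sort.h"
#include <cstdint>
#include <vector>

void sort_keys(std::vector<uint32_t>& keys)
{
  std::vector<uint32_t> tmp(keys.size());  // scratch buffer, same size as input
  embree::radix_sort_u32(keys.data(), tmp.data(), keys.size());
}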