initial commit, 4.5 stable

2025-09-16 20:46:46 -04:00
commit 9d30169a8d
13378 changed files with 7050105 additions and 0 deletions
--- a/thirdparty/embree/LICENSE.txt
+++ b/thirdparty/embree/LICENSE.txt
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/thirdparty/embree/common/algorithms/parallel_any_of.h
+++ b/thirdparty/embree/common/algorithms/parallel_any_of.h
@@ -0,0 +1,56 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <functional>
+#include "parallel_reduce.h"
+
+namespace embree
+{
+  /*! Returns true if pred(i) is true for at least one index i in [first,last). The indices are tested by parallel tasks; the TBB paths stop early once a match is found, the fallback path tests every index. */
+  template<typename Index, class UnaryPredicate>
+    __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred)
+  {
+    std::atomic_bool ret;
+    ret = false;
+    
+#if defined(TASKING_TBB)
+#if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) {
+        if (context.is_group_execution_cancelled()) return;
+        for (size_t i = r.begin(); i != r.end(); ++i) {
+          if (!pred(i)) continue;
+          ret = true;
+          context.cancel_group_execution();
+          return; /* match found: skip the rest of this chunk */
+        }
+      }, context); /* bind the loop to 'context' so that cancelling it also stops chunks that have not started yet */
+#else
+    tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) {
+        if (tbb::task::self().is_cancelled()) return;
+        for (size_t i = r.begin(); i != r.end(); ++i) {
+          if (!pred(i)) continue;
+          ret = true;
+          tbb::task::self().cancel_group_execution();
+          return; /* match found: skip the rest of this chunk */
+        }
+      });
+#endif
+#else
+    ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool {
+        bool localret = false;
+        for (auto i=r.begin(); i<r.end(); ++i) {
+          localret |= pred(i);
+        }
+        return localret;
+      },
+      std::bit_or<bool>()
+      );
+#endif
+    
+    return ret;
+  }
+  
+} // end namespace
--- a/thirdparty/embree/common/algorithms/parallel_filter.h
+++ b/thirdparty/embree/common/algorithms/parallel_filter.h
@@ -0,0 +1,93 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename Ty, typename Index, typename Predicate>
+    inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
+  { /* compacts data[first,last) in place: elements accepted by predicate are moved to the front in their original order */
+    Index j = first; /* next write position */
+    for (Index i=first; i<last; i++)
+      if (predicate(data[i]))
+        data[j++] = data[i];
+
+    return j; /* index one past the last kept element */
+  }
+  /*! Parallel in-place filter: moves the elements of data[begin,end) accepted by predicate to the front of the range and returns the index one past the last kept element. Ranges of at most minStepSize elements are filtered sequentially. */
+  template<typename Ty, typename Index, typename Predicate>
+    inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
+  {
+    /* sequential fallback */
+    if (end-begin <= minStepSize)
+      return sequential_filter(data,begin,end,predicate);
+
+    /* calculate number of tasks to use */
+    enum { MAX_TASKS = 64 };
+    const Index numThreads = TaskScheduler::threadCount();
+    const Index numBlocks  = (end-begin+minStepSize-1)/minStepSize;
+    const Index taskCount  = min(numThreads,numBlocks,(Index)MAX_TASKS);
+
+    /* filter blocks: every task compacts its own slice and records how many elements it kept (nused) and dropped (nfree) */
+    Index nused[MAX_TASKS];
+    Index nfree[MAX_TASKS];
+    parallel_for(taskCount, [&](const Index taskIndex)
+    {
+      const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
+      const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
+      const Index i2 = sequential_filter(data,i0,i1,predicate);
+      nused[taskIndex] = i2-i0;
+      nfree[taskIndex] = i1-i2;
+    });
+
+    /* calculate offsets: sused/sfree are the totals, pfree[i] is the number of free slots in the slices before slice i */
+    Index sused=0;
+    Index sfree=0;
+    Index pfree[MAX_TASKS];
+    for (Index i=0; i<taskCount; i++) 
+    {
+      sused+=nused[i];
+      Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
+    }
+
+    /* return if we did not filter out any element */
+    assert(sfree <= end-begin);
+    assert(sused <= end-begin);
+    if (sused == end-begin)
+      return end;
+
+    /* otherwise we have to copy misplaced elements around: each task fills the free slots of its own slice that lie below begin+sused */
+    parallel_for(taskCount, [&](const Index taskIndex)
+    {
+      /* destination to write elements to (first free slot of this slice, clamped to the final end begin+sused) */
+      Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
+      Index dst_end = min(dst+nfree[taskIndex],begin+sused);
+      if (dst_end <= dst) return; /* this slice has no free slot inside the final range */
+
+      /* range of misplaced elements to copy to destination */
+      Index r0 = pfree[taskIndex];
+      Index r1 = r0+dst_end-dst;
+
+      /* find range in misplaced elements in back to front order; [k0,k1) numbers the kept elements of slice i counted from the back */
+      Index k0=0;
+      for (Index i=taskCount-1; i>0; i--)
+      {
+        if (k0 > r1) break;
+        Index k1 = k0+nused[i];
+        Index src = begin+(i+0)*(end-begin)/taskCount+nused[i]; /* one past the last kept element of slice i */
+        for (Index i=max(r0,k0); i<min(r1,k1); i++) { /* note: this inner i shadows the slice index i of the outer loop */
+          Index isrc = src-i+k0-1;
+          assert(dst >= begin && dst < end);
+          assert(isrc >= begin && isrc < end);
+          data[dst++] = data[isrc];
+        }
+        k0 = k1;
+      }
+    });
+
+    return begin+sused;
+  }
+}
--- a/thirdparty/embree/common/algorithms/parallel_for.h
+++ b/thirdparty/embree/common/algorithms/parallel_for.h
@@ -0,0 +1,161 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../tasking/taskscheduler.h"
+#include "../sys/array.h"
+#include "../math/emath.h"
+#include "../math/range.h"
+
+namespace embree
+{
+  /* parallel_for without range: invokes func(i) for every index i in [0,N) using the tasking system selected at compile time */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index N, const Func& func)
+  {
+#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)
+    if (N) {
+      TaskScheduler::TaskGroupContext context;
+      TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) {
+          assert(r.size() == 1);
+          func(r.begin());
+        },&context);
+      TaskScheduler::wait();
+      if (context.cancellingException != nullptr) { /* re-raise the exception stored in the task group context */
+        std::rethrow_exception(context.cancellingException);
+      }
+    }
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+        func(i);
+      },context);
+    if (context.is_group_execution_cancelled()) /* a cancelled task group is reported to the caller as an exception */
+      throw std::runtime_error("task cancelled");
+  #else
+    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+        func(i);
+      });
+    if (tbb::task::self().is_cancelled())
+      throw std::runtime_error("task cancelled");
+  #endif
+
+#elif defined(TASKING_PPL)
+    concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) { 
+        func(i);
+      });
+#else
+#  error "no tasking system enabled"
+#endif
+  }
+  
+  /* parallel for with range and granularity: invokes func(range) on sub-ranges of [first,last); minStepSize is forwarded to the tasking system as the step/grain size */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func)
+  {
+    assert(first <= last);
+#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)
+    TaskScheduler::TaskGroupContext context;
+    TaskScheduler::spawn(first,last,minStepSize,func,&context);
+    TaskScheduler::wait();
+    if (context.cancellingException != nullptr) { /* re-raise the exception stored in the task group context */
+      std::rethrow_exception(context.cancellingException);
+    }
+
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+        func(range<Index>(r.begin(),r.end()));
+      },context);
+    if (context.is_group_execution_cancelled()) /* a cancelled task group is reported to the caller as an exception */
+      throw std::runtime_error("task cancelled");
+  #else
+    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+        func(range<Index>(r.begin(),r.end()));
+      });
+    if (tbb::task::self().is_cancelled())
+      throw std::runtime_error("task cancelled");
+  #endif
+
+#elif defined(TASKING_PPL)
+    concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) { /* the PPL path ignores minStepSize ... */
+        func(range<Index>(i,i+1)); /* ... and calls func once per index with a range of size one */
+      });
+
+#else
+#  error "no tasking system enabled"
+#endif
+  }
+  
+  /* parallel for with range: same as the overload taking a minStepSize, called with a step size of 1 */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index first, const Index last, const Func& func)
+  {
+    assert(first <= last);
+    parallel_for(first,last,(Index)1,func);
+  }
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001)
+  /* TBB variant: invokes func(i) for every i in [0,N) through tbb::parallel_for with a tbb::simple_partitioner */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_static( const Index N, const Func& func)
+  {
+    #if TBB_INTERFACE_VERSION >= 12002
+      tbb::task_group_context context;
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },tbb::simple_partitioner(),context);
+      if (context.is_group_execution_cancelled()) /* a cancelled task group is reported to the caller as an exception */
+        throw std::runtime_error("task cancelled");
+    #else
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },tbb::simple_partitioner());
+      if (tbb::task::self().is_cancelled())
+        throw std::runtime_error("task cancelled");
+    #endif
+  }
+  /* TBB variant: expose tbb::affinity_partitioner under the embree namespace */
+  typedef tbb::affinity_partitioner affinity_partitioner;
+  /* TBB variant: invokes func(i) for every i in [0,N) through tbb::parallel_for using the caller-provided affinity partitioner ap */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap)
+  {
+    #if TBB_INTERFACE_VERSION >= 12002
+      tbb::task_group_context context;
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },ap,context);
+      if (context.is_group_execution_cancelled()) /* a cancelled task group is reported to the caller as an exception */
+        throw std::runtime_error("task cancelled");
+    #else
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },ap);
+      if (tbb::task::self().is_cancelled())
+        throw std::runtime_error("task cancelled");
+    #endif
+  }
+
+#else
+  /* fallback variant: simply forwards to parallel_for(N,func) */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_static( const Index N, const Func& func) 
+  {
+    parallel_for(N,func);
+  }
+  /* fallback variant: empty stand-in for tbb::affinity_partitioner, used as the parameter type of the fallback parallel_for_affinity */
+  struct affinity_partitioner {
+  };
+  /* fallback variant: the partitioner ap is ignored and the call forwards to parallel_for(N,func) */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap) 
+  {
+    parallel_for(N,func);
+  }
+
+#endif
+}
--- a/thirdparty/embree/common/algorithms/parallel_for_for.h
+++ b/thirdparty/embree/common/algorithms/parallel_for_for.h
@@ -0,0 +1,160 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename ArrayArray, typename Func>
+    __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) 
+  { /* serial iteration over an array of arrays: calls func(array2[i], range(0,size), k) once per non-empty sub-array; minStepSize is not used */
+    size_t k=0; /* total number of elements in the sub-arrays visited so far */
+    for (size_t i=0; i!=array2.size(); ++i) {
+      const size_t N = array2[i] ? array2[i]->size() : 0; /* null entries count as empty, as in parallel_for_for */
+      if (N) func(array2[i],range<size_t>(0,N),k);
+      k+=N;
+    }
+  }
+  /*! Precomputed work split for iterating over an array of arrays: distributes the N elements of all sub-arrays over at most MAX_TASKS tasks and stores, per task, the sub-array index (i0) and the element offset inside it (j0) at which the task starts. */
+  class ParallelForForState
+  {
+  public:
+
+    enum { MAX_TASKS = 64 };
+
+    __forceinline ParallelForForState () 
+      : taskCount(0), N(0) {} /* N is zeroed as well so that size() is well defined before init() is called */
+
+    template<typename ArrayArray>
+      __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) {
+      init(array2,minStepSize);
+    }
+
+    template<typename SizeFunc>
+    __forceinline ParallelForForState (const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize) {
+      init(numArrays,getSize,minStepSize);
+    } 
+
+    template<typename SizeFunc>
+    __forceinline void init ( const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize )
+    {
+      /* first calculate total number of elements */
+      size_t N = 0;
+      for (size_t i=0; i<numArrays; i++) {
+	N += getSize(i);
+      }
+      this->N = N;
+
+      /* calculate number of tasks to use (at least 1, at most MAX_TASKS) */
+      const size_t numThreads = TaskScheduler::threadCount();
+      const size_t numBlocks  = (N+minStepSize-1)/minStepSize;
+      taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS)));
+      
+      /* calculate start (i,j) for each task; task t starts at flattened element index t*N/taskCount */
+      size_t taskIndex = 0;
+      i0[taskIndex] = 0;
+      j0[taskIndex] = 0;
+      size_t k0 = (++taskIndex)*N/taskCount; /* flattened start index of the next task to place */
+      for (size_t i=0, k=0; taskIndex < taskCount; i++) /* k is the flattened index of element j of sub-array i */
+      {
+	assert(i<numArrays);
+	size_t j=0, M = getSize(i);
+	while (j<M && k+M-j >= k0 && taskIndex < taskCount) { /* the next task start lies within the rest of sub-array i */
+	  assert(taskIndex<taskCount);
+	  i0[taskIndex] = i;
+	  j0[taskIndex] = j += k0-k;
+	  k=k0;
+	  k0 = (++taskIndex)*N/taskCount;
+	}
+	k+=M-j; /* advance to the end of sub-array i */
+      }
+    }
+
+    template<typename ArrayArray>
+      __forceinline void init ( ArrayArray& array2, const size_t minStepSize )
+    {
+      init(array2.size(),[&](size_t i) { return array2[i] ? array2[i]->size() : 0; },minStepSize); /* null sub-arrays count as empty */
+    }
+    
+    __forceinline size_t size() const {
+      return N;
+    }
+    
+  public:
+    size_t i0[MAX_TASKS]; /* per task: index of the sub-array the task starts in */
+    size_t j0[MAX_TASKS]; /* per task: element offset inside that sub-array */
+    size_t taskCount;     /* number of tasks in use; only the first taskCount entries of i0/j0 are set */
+    size_t N;             /* total number of elements over all sub-arrays */
+  };
+  /*! Calls func(array2[i], range, k) for element ranges of the sub-arrays of array2, split over state.taskCount parallel tasks; k is the index of the first element of the range when all sub-arrays are numbered consecutively. Null sub-arrays are treated as empty. */
+  template<typename ArrayArray, typename Func>
+    __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
+  {
+    ParallelForForState state(array2,minStepSize);
+    
+    parallel_for(state.taskCount, [&](const size_t taskIndex) 
+    {
+      /* calculate range [k0,k1) of flattened element indices handled by this task */
+      const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
+      const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
+      size_t i0 = state.i0[taskIndex];
+      size_t j0 = state.j0[taskIndex];
+
+      /* iterate over arrays */
+      size_t k=k0;
+      for (size_t i=i0; k<k1; i++) {
+        const size_t N =  array2[i] ? array2[i]->size() : 0;
+        const size_t r0 = j0, r1 = min(N,r0+k1-k); /* stop at the end of the sub-array or at the end of this task's share */
+        if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k);
+        k+=r1-r0; j0 = 0; /* every following sub-array is processed from its first element */
+      }
+    });
+  }
+  /*! Convenience overload: parallel_for_for with a minStepSize of 1 */
+  template<typename ArrayArray, typename Func>
+    __forceinline void parallel_for_for( ArrayArray& array2, const Func& func )
+  {
+    parallel_for_for(array2,1,func);
+  }
+  /*! Like parallel_for_for, but combines the values returned by func: each task folds its results with reduction starting from identity, then the per-task results are folded serially in task order and returned. */
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    ParallelForForState state(array2,minStepSize);
+    Value temp[ParallelForForState::MAX_TASKS]; /* one partial result per task */
+
+    for (size_t i=0; i<state.taskCount; i++)
+      temp[i] = identity;
+    
+    parallel_for(state.taskCount, [&](const size_t taskIndex) 
+    {
+      /* calculate range [k0,k1) of flattened element indices handled by this task */
+      const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
+      const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
+      size_t i0 = state.i0[taskIndex];
+      size_t j0 = state.j0[taskIndex];
+
+      /* iterate over arrays */
+      size_t k=k0;
+      for (size_t i=i0; k<k1; i++) {
+        const size_t N =  array2[i] ? array2[i]->size() : 0;
+        const size_t r0 = j0, r1 = min(N,r0+k1-k); /* stop at the end of the sub-array or at the end of this task's share */
+        if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k));
+        k+=r1-r0; j0 = 0; /* every following sub-array is processed from its first element */
+      }
+    });
+
+    Value ret = identity;
+    for (size_t i=0; i<state.taskCount; i++)
+      ret = reduction(ret,temp[i]);
+    return ret;
+  }
+  /*! Convenience overload: parallel_for_for_reduce with a minStepSize of 1 */
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    return parallel_for_for_reduce(array2,1,identity,func,reduction);
+  }
+}
--- a/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h
+++ b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h
@@ -0,0 +1,142 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for_for.h"
+#include "parallel_prefix_sum.h"
+
+namespace embree
+{
+  template<typename Value>
+    struct ParallelForForPrefixSumState : public ParallelForForState
+  { /* ParallelForForState extended by the per-task storage (prefix_state) used by the parallel_for_for_prefix_sum functions */
+    __forceinline ParallelForForPrefixSumState () {}
+
+    template<typename ArrayArray>
+      __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize)
+      : ParallelForForState(array2,minStepSize) {}
+
+    template<typename SizeFunc>
+    __forceinline ParallelForForPrefixSumState (size_t numArrays, const SizeFunc& getSize, const size_t minStepSize)
+      : ParallelForForState(numArrays,getSize,minStepSize) {}
+
+    ParallelPrefixSumState<Value> prefix_state; /* its counts[] and sums[] entries are written per task by parallel_for_for_prefix_sum0_/1_ */
+  };
+  /*! First prefix-sum pass: each task reduces the values returned by func(i, range, k) over its share of the elements into prefix_state.counts[task]; afterwards prefix_state.sums[task] receives the reduction of all preceding counts and the overall reduction is returned. minStepSize is not used here. */
+  template<typename SizeFunc, typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum0_( ParallelForForPrefixSumState<Value>& state, Index minStepSize, 
+                                                       const SizeFunc& getSize, const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    /* calculate number of tasks to use */
+    const size_t taskCount = state.taskCount;
+    
+    /* perform parallel prefix sum */
+    parallel_for(taskCount, [&](const size_t taskIndex)
+    {
+      const size_t k0 = (taskIndex+0)*state.size()/taskCount;
+      const size_t k1 = (taskIndex+1)*state.size()/taskCount;
+      size_t i0 = state.i0[taskIndex];
+      size_t j0 = state.j0[taskIndex];
+
+      /* iterate over arrays */
+      size_t k=k0;
+      Value N=identity; /* running reduction of this task's results */
+      for (size_t i=i0; k<k1; i++) {
+	const size_t size = getSize(i);
+        const size_t r0 = j0, r1 = min(size,r0+k1-k);
+        if (r1 > r0) N = reduction(N, func((Index)i,range<Index>((Index)r0,(Index)r1),(Index)k));
+        k+=r1-r0; j0 = 0;
+      }
+      state.prefix_state.counts[taskIndex] = N;
+    });
+
+    /* calculate prefix sum (exclusive: sums[i] does not include counts[i]) */
+    Value sum=identity;
+    for (size_t i=0; i<taskCount; i++)
+    {
+      const Value c = state.prefix_state.counts[i];
+      state.prefix_state.sums[i] = sum;
+      sum=reduction(sum,c);
+    }
+
+    return sum;
+  }
+  /*! Second prefix-sum pass: like parallel_for_for_prefix_sum0_, but func receives as extra argument the value of prefix_state.sums[task] combined with what the task has reduced so far. NOTE(review): sums[] is read as left by an earlier call, presumably parallel_for_for_prefix_sum0_ on the same state -- confirm with callers. minStepSize is not used here. */
+  template<typename SizeFunc, typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum1_( ParallelForForPrefixSumState<Value>& state, Index minStepSize, 
+                                                       const SizeFunc& getSize, 
+                                                       const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    /* calculate number of tasks to use */
+    const size_t taskCount = state.taskCount;
+    /* perform parallel prefix sum */
+    parallel_for(taskCount, [&](const size_t taskIndex)
+    {
+      const size_t k0 = (taskIndex+0)*state.size()/taskCount;
+      const size_t k1 = (taskIndex+1)*state.size()/taskCount;
+      size_t i0 = state.i0[taskIndex];
+      size_t j0 = state.j0[taskIndex];
+
+      /* iterate over arrays */
+      size_t k=k0;
+      Value N=identity; /* running reduction of this task's results */
+      for (size_t i=i0; k<k1; i++) {
+	const size_t size = getSize(i);
+        const size_t r0 = j0, r1 = min(size,r0+k1-k);
+        if (r1 > r0) N = reduction(N, func((Index)i,range<Index>((Index)r0,(Index)r1),(Index)k,reduction(state.prefix_state.sums[taskIndex],N)));
+        k+=r1-r0; j0 = 0;
+      }
+      state.prefix_state.counts[taskIndex] = N;
+    });
+
+    /* calculate prefix sum (exclusive: sums[i] does not include counts[i]) */
+    Value sum=identity;
+    for (size_t i=0; i<taskCount; i++)
+    {
+      const Value c = state.prefix_state.counts[i];
+      state.prefix_state.sums[i] = sum;
+      sum=reduction(sum,c);
+    }
+
+    return sum;
+  }
+  /*! Array-of-arrays front end of parallel_for_for_prefix_sum0_: null sub-arrays count as size 0 and func is called as func(array2[i], range, k, i). */
+  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+  __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state,
+                                                    ArrayArray& array2, Index minStepSize, 
+                                                    const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    return parallel_for_for_prefix_sum0_(state,minStepSize,
+                                        [&](Index i) { return array2[i] ? array2[i]->size() : 0; },
+                                        identity,
+                                        [&](Index i, const range<Index>& r, Index k) { return func(array2[i], r, k, i); },
+                                        reduction);
+  }
+  /*! Array-of-arrays front end of parallel_for_for_prefix_sum1_: null sub-arrays count as size 0 and func is called as func(array2[i], range, k, i, base). */
+  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+  __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state,
+                                                    ArrayArray& array2, Index minStepSize, 
+                                                    const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    return parallel_for_for_prefix_sum1_(state,minStepSize,
+                                        [&](Index i) { return array2[i] ? array2[i]->size() : 0; },
+                                        identity,
+                                        [&](Index i, const range<Index>& r, Index k, const Value& base) { return func(array2[i], r, k, i, base); },
+                                        reduction);
+  }                                       
+  /*! Convenience overload: parallel_for_for_prefix_sum0 with a minStepSize of size_t(1) */
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, 
+						     const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction);
+  }
+  /*! Convenience overload: parallel_for_for_prefix_sum1 with a minStepSize of size_t(1) */
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, 
+						     const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction);
+  }
+}
--- a/thirdparty/embree/common/algorithms/parallel_map.h
+++ b/thirdparty/embree/common/algorithms/parallel_map.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_sort.h"
+
+namespace embree
+{
+  /*! implementation of a key/value map with parallel construction; entries live in one vector that is radix sorted at init time and searched with std::lower_bound */
+  template<typename Key, typename Val>
+  class parallel_map
+  {
+    /* key/value pair to build the map */
+    struct KeyValue
+    {
+      __forceinline KeyValue () {} // does not explicitly initialize key or val
+
+      __forceinline KeyValue (const Key key, const Val val)
+	: key(key), val(val) {}
+
+      __forceinline operator Key() const { // conversion to the key; presumably what lets the sort and lower_bound compare a KeyValue against a Key
+	return key;
+      }
+
+    public:
+      Key key;
+      Val val;
+    };
+
+  public:
+    
+    /*! creates an empty parallel map */
+    parallel_map () {}
+
+    /*! construction from a pair of vectors, see init() */
+    template<typename KeyVector, typename ValVector>
+      parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); }
+
+    /*! initializes the parallel map from a vector of keys and a vector of values of the same size (keys[i] maps to values[i]) */
+    template<typename KeyVector, typename ValVector>
+      void init(const KeyVector& keys, const ValVector& values) 
+    {
+      /* reserve sufficient space for all data */
+      assert(keys.size() == values.size());
+      vec.resize(keys.size());
+      
+      /* generate key/value pairs in parallel, in blocks of at least 4*4096 elements */
+      parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) {
+	for (size_t i=r.begin(); i<r.end(); i++)
+	  vec[i] = KeyValue((Key)keys[i],values[i]);
+      });
+
+      /* perform parallel radix sort of the key/value pairs; temp is scratch storage of the same size */
+      std::vector<KeyValue> temp(keys.size());
+      radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size());
+    }
+
+    /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr if the key is not contained in the map. */
+    __forceinline const Val* lookup(const Key& key) const 
+    {
+      typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
+      if (i == vec.end()) return nullptr;
+      if (i->key != key) return nullptr; // lower_bound found the first entry not less than key; it may be a larger key
+      return &i->val;
+    }
+
+    /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */
+    __forceinline Val lookup(const Key& key, const Val& def) const 
+    {
+      typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
+      if (i == vec.end()) return def;
+      if (i->key != key) return def; // lower_bound found the first entry not less than key; it may be a larger key
+      return i->val;
+    }
+
+    /*! clears all state */
+    void clear() {
+      vec.clear();
+    }
+
+  private:
+    std::vector<KeyValue> vec;    //!< vector containing sorted elements
+  };
+}
--- a/thirdparty/embree/common/algorithms/parallel_partition.h
+++ b/thirdparty/embree/common/algorithms/parallel_partition.h
@@ -0,0 +1,283 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+#include "../math/range.h"
+
+namespace embree
+{
+  /* serial partitioning of array[begin,end): moves elements for which is_left is true to the front and returns the index of the first right element */
+  template<typename T, typename V, typename IsLeft, typename Reduction_T>
+    __forceinline size_t serial_partitioning(T* array, 
+                                             const size_t begin,
+                                             const size_t end, 
+                                             V& leftReduction,
+                                             V& rightReduction,
+                                             const IsLeft& is_left, 
+                                             const Reduction_T& reduction_t)
+  {
+    T* l = array + begin;
+    T* r = array + end - 1;
+    /* l and r scan towards each other; each visited element is folded into exactly one of the two reductions */
+    while(1)
+    {
+      /* skip elements at the front that already belong to the left side */
+      while (likely(l <= r && is_left(*l) )) 
+      {
+        //prefetchw(l+4); // FIXME: enable?
+        reduction_t(leftReduction,*l);
+        ++l;
+      }
+      /* skip elements at the back that already belong to the right side */
+      while (likely(l <= r && !is_left(*r)))
+      {
+        //prefetchw(r-4); FIXME: enable?
+        reduction_t(rightReduction,*r);
+        --r;
+      }
+      if (r<l) break;
+      /* here *l belongs right and *r belongs left: account for each on its destination side, then swap them */
+      reduction_t(leftReduction ,*r);
+      reduction_t(rightReduction,*l);
+      xchg(*l,*r);
+      l++; r--;
+    }
+    /* l points at the first element of the right side; the result is an index relative to array, not to begin */
+    return l - array;        
+  }
+
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    class __aligned(64) parallel_partition_task
+  {
+    ALIGNED_CLASS_(64);
+  private:
+    /* partitions array[0,N) in parallel: each task partitions its own sub-range serially, then misplaced items are swapped across sub-ranges */
+    static const size_t MAX_TASKS = 64; // upper bound on numTasks; sizes the per-task arrays below
+
+    T* array;
+    size_t N;
+    const IsLeft& is_left;
+    const Reduction_T& reduction_t;
+    const Reduction_V& reduction_v;
+    const Vi& identity;
+
+    size_t numTasks; // number of tasks used by both parallel phases
+    __aligned(64) size_t counter_start[MAX_TASKS+1]; // start index of each task's sub-range; entry numTasks is set to N
+    __aligned(64) size_t counter_left[MAX_TASKS+1];  // number of left elements found in each task's sub-range
+    __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];  // runs of right elements that lie inside the global left part
+    __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS]; // runs of left elements that lie inside the global right part
+    __aligned(64) V leftReductions[MAX_TASKS];           // per-task reduction over left elements
+    __aligned(64) V rightReductions[MAX_TASKS];    // per-task reduction over right elements
+
+  public:
+     
+    __forceinline parallel_partition_task(T* array, 
+                                          const size_t N, 
+                                          const Vi& identity, 
+                                          const IsLeft& is_left, 
+                                          const Reduction_T& reduction_t, 
+                                          const Reduction_V& reduction_v,
+                                          const size_t BLOCK_SIZE) 
+      /* numTasks = min(ceil(N/BLOCK_SIZE), scheduler thread count, MAX_TASKS); functors and identity are stored by reference */
+      : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
+      numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}
+
+    __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
+    { /* maps a global item index onto the range list r: returns the range containing it and rewrites index to the offset inside that range */
+      size_t i = 0;
+      while(index >= (size_t)r[i].size())
+      {
+        assert(i < numRanges);
+        index -= (size_t)r[i].size();
+        i++;
+      }	    
+      return &r[i];
+    }
+
+    __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
+                                                  const size_t numRightMisplacedRanges,
+                                                  const size_t startID,
+                                                  const size_t endID)
+    { /* swaps misplaced items number startID..endID-1 of the left list with the same-numbered items of the right list */
+      size_t leftLocalIndex  = startID;
+      size_t rightLocalIndex = startID;
+      const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
+      const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);
+      /* items remaining in the current range on each side, and pointers to the next item on each side */
+      size_t l_left = l_range->size() - leftLocalIndex;
+      size_t r_left = r_range->size() - rightLocalIndex;
+      T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
+      T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
+      size_t size = endID - startID;
+      size_t items = min(size,min(l_left,r_left)); 
+      /* process chunks of `items` pairs so that a chunk never crosses a range boundary on either side */
+      while (size)
+      {
+        if (unlikely(l_left == 0))
+        {
+          l_range++;
+          l_left = l_range->size();
+          l = &array[l_range->begin()];
+          items = min(size,min(l_left,r_left));
+        }
+
+        if (unlikely(r_left == 0))
+        {		
+          r_range++;
+          r_left = r_range->size();
+          r = &array[r_range->begin()];          
+          items = min(size,min(l_left,r_left));
+        }
+        /* account for the chunk before swapping it; items reaches 0 in the loop below */
+        size   -= items;
+        l_left -= items;
+        r_left -= items;
+        /* exchange the chunk pairwise */
+        while(items) {
+          items--;
+          xchg(*l++,*r++);
+        }
+      }
+    }
+
+    __forceinline size_t partition(V& leftReduction, V& rightReduction)
+    { /* runs both phases; returns the number of left elements (index of the first right element relative to array) */
+      /* partition the individual ranges for each task */
+      parallel_for(numTasks,[&] (const size_t taskID) {
+          const size_t startID = (taskID+0)*N/numTasks;
+          const size_t endID   = (taskID+1)*N/numTasks;
+          V local_left(identity);
+          V local_right(identity);
+          const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
+          counter_start[taskID] = startID;
+          counter_left [taskID] = mid-startID;
+          leftReductions[taskID]  = local_left;
+          rightReductions[taskID] = local_right;
+        });
+      counter_start[numTasks] = N;
+      counter_left[numTasks]  = 0;
+      
+      /* finalize the reductions by merging the per-task results into the caller's accumulators */
+      for (size_t i=0; i<numTasks; i++) {
+        reduction_v(leftReduction,leftReductions[i]);
+        reduction_v(rightReduction,rightReductions[i]);
+      }
+
+      /* calculate mid point for partitioning: total number of left elements over all tasks */
+      size_t mid = counter_left[0];
+      for (size_t i=1; i<numTasks; i++)
+        mid += counter_left[i];
+      const range<ssize_t> globalLeft (0,mid);
+      const range<ssize_t> globalRight(mid,N);
+
+      /* calculate all left and right ranges that are on the wrong global side */
+      size_t numMisplacedRangesLeft  = 0;
+      size_t numMisplacedRangesRight = 0;
+      size_t numMisplacedItemsLeft  MAYBE_UNUSED = 0;
+      size_t numMisplacedItemsRight MAYBE_UNUSED = 0;
+
+      for (size_t i=0; i<numTasks; i++)
+      {	    
+        const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
+        const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
+        const range<ssize_t> left_misplaced  = globalLeft. intersect(right_range);
+        const range<ssize_t> right_misplaced = globalRight.intersect(left_range);
+
+        if (!left_misplaced.empty())  
+        {
+          numMisplacedItemsLeft += left_misplaced.size();
+          leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
+        }
+
+        if (!right_misplaced.empty()) 
+        {
+          numMisplacedItemsRight += right_misplaced.size();
+          rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
+        }
+      }
+      assert( numMisplacedItemsLeft == numMisplacedItemsRight );
+
+      /* if no items are misplaced we are done */
+      if (numMisplacedItemsLeft == 0)
+        return mid;
+
+      /* otherwise we swap the misplaced items to the right place in parallel, each task handling an equal share of the item pairs */
+      parallel_for(numTasks,[&] (const size_t taskID) {
+          const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
+          const size_t endID   = (taskID+1)*numMisplacedItemsLeft/numTasks;
+          swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);	                             
+        });
+
+      return mid;
+    }
+  };
+
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    __noinline size_t parallel_partitioning(T* array, 
+                                            const size_t begin,
+                                            const size_t end, 
+                                            const Vi &identity,
+                                            V &leftReduction,
+                                            V &rightReduction,
+                                            const IsLeft& is_left, 
+                                            const Reduction_T& reduction_t,
+                                            const Reduction_V& reduction_v,
+                                            size_t BLOCK_SIZE = 128)
+  { /* partitions array[begin,end) by is_left and returns the index of the first right element; BLOCK_SIZE is also the serial/parallel switch-over size */
+    /* fall back to single threaded partitioning for small N */
+    if (unlikely(end-begin < BLOCK_SIZE))
+      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+    /* otherwise use parallel code; the task object is heap allocated and works on the sub-array starting at begin */
+    else {
+      typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+      std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+      return begin+p->partition(leftReduction,rightReduction);    
+    }
+  }
+
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    __noinline size_t parallel_partitioning(T* array, 
+                                            const size_t begin,
+                                            const size_t end, 
+                                            const Vi &identity,
+                                            V &leftReduction,
+                                            V &rightReduction,
+                                            const IsLeft& is_left, 
+                                            const Reduction_T& reduction_t,
+                                            const Reduction_V& reduction_v,
+                                            size_t BLOCK_SIZE,
+                                            size_t PARALLEL_THRESHOLD)
+  { /* same as the overload above, but the serial/parallel switch-over size (PARALLEL_THRESHOLD) is separate from the per-task BLOCK_SIZE */
+    /* fall back to single threaded partitioning for small N */
+    if (unlikely(end-begin < PARALLEL_THRESHOLD))
+      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+    /* otherwise use parallel code; the task object is heap allocated and works on the sub-array starting at begin */
+    else {
+      typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+      std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+      return begin+p->partition(leftReduction,rightReduction);    
+    }
+  }
+
+
+  template<typename T, typename IsLeft>
+    inline size_t parallel_partitioning(T* array, 
+                                        const size_t begin,
+                                        const size_t end, 
+                                        const IsLeft& is_left, 
+                                        size_t BLOCK_SIZE = 128)
+  { /* partition-only convenience overload: no user reductions, returns the index of the first right element */
+    size_t leftReduction = 0; // dummy accumulators required by the general interface
+    size_t rightReduction = 0;
+    return parallel_partitioning(
+      array,begin,end,0,leftReduction,rightReduction,is_left,
+      [] (size_t& t,const T& ref) {  }, // element reduction: intentionally a no-op
+      [] (size_t& t0,size_t& t1) { }, // merge of per-task reductions: intentionally a no-op
+      BLOCK_SIZE);
+  }
+
+}
--- a/thirdparty/embree/common/algorithms/parallel_prefix_sum.h
+++ b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename Value>
+    struct ParallelPrefixSumState 
+  {
+    enum { MAX_TASKS = 64 }; // upper bound on the task count used by parallel_prefix_sum
+    Value counts[MAX_TASKS]; // per-task result of the last parallel_prefix_sum call
+    Value sums  [MAX_TASKS]; // per-task start value: reduction of the counts of all preceding tasks
+  };
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    /* calculate number of tasks to use: limited by the thread count, the number of minStepSize blocks and MAX_TASKS */
+    const size_t numThreads = TaskScheduler::threadCount();
+    const size_t numBlocks  = (last-first+minStepSize-1)/minStepSize;
+    const size_t taskCount  = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
+
+    /* run func on each task's sub-range; func also receives state.sums[taskIndex] as left there by an earlier call on the same state */
+    parallel_for(taskCount, [&](const size_t taskIndex)
+    {
+      const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
+      const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
+      state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
+    });
+
+    /* calculate exclusive prefix sum over the per-task counts: sums[i] is the reduction of counts[0..i) starting from identity */
+    Value sum=identity;
+    for (size_t i=0; i<taskCount; i++) 
+    {
+      const Value c = state.counts[i];
+      state.sums[i] = sum;
+      sum=reduction(sum,c);
+    }
+    /* sum now holds the reduction over all tasks */
+    return sum;
+  }
+
+  /*! parallel calculation of exclusive prefix sums: dst[i] receives the combination of identity and src[0..i) under add, the total is returned */
+  template<typename SrcArray, typename DstArray, typename Value, typename Add>
+    __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) 
+  {
+    /* perform single threaded prefix operation for small N */
+    if (N < SINGLE_THREAD_THRESHOLD) 
+    {
+      Value sum=identity;
+      for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum; // dst[i] is written before src[i] is added
+      return sum;
+    }
+    
+    /* perform parallel prefix operation for large N */
+    else 
+    {
+      ParallelPrefixSumState<Value> state;
+      
+      /* initial run just sets up start values for subtasks; the sum argument is ignored here */
+      parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
+          
+          Value s = identity;
+          for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
+          return s;
+          
+        }, add);
+      
+      /* final run calculates prefix sum; sum is the start value of the sub-range computed by the initial run */
+      return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
+          
+          Value s = identity;
+          for (size_t i=r.begin(); i<r.end(); i++) {
+            dst[i] = add(sum,s);
+            s = add(s,src[i]);
+          }
+          return s;
+          
+        }, add);
+    }
+  }
+}
--- a/thirdparty/embree/common/algorithms/parallel_reduce.h
+++ b/thirdparty/embree/common/algorithms/parallel_reduce.h
@@ -0,0 +1,146 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) 
+  { /* serial counterpart of parallel_reduce: func is invoked once on the whole range; identity and reduction are not used */
+    return func(range<Index>(first,last));
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  { /* serial counterpart of parallel_reduce: func is invoked once on the whole range; minStepSize, identity and reduction are not used */
+    return func(range<Index>(first,last));
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  { /* splits [first,last) evenly into tasks, evaluates func per task in parallel and folds the results serially; minStepSize is not used here */
+    const Index maxTasks = 512;
+    const Index threadCount = (Index) TaskScheduler::threadCount();
+    taskCount = min(taskCount,threadCount,maxTasks); // requested task count is clamped by thread count and maxTasks
+
+    /* parallel invocation of all tasks */
+    dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
+    parallel_for(taskCount, [&](const Index taskIndex) {
+        const Index k0 = first+(taskIndex+0)*(last-first)/taskCount;
+        const Index k1 = first+(taskIndex+1)*(last-first)/taskCount;
+        values[taskIndex] = func(range<Index>(k0,k1));
+      });
+
+    /* perform reduction over all tasks, in task order and starting from identity */
+    Value v = identity;
+    for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]);
+    return v;
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  { /* reduces func over sub-ranges of [first,last) using whichever tasking backend is selected at compile time */
+#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)
+    /* internal tasking system */
+    /* fast path for small number of iterations */
+    Index taskCount = (last-first+minStepSize-1)/minStepSize;
+    if (likely(taskCount == 1)) {
+      return func(range<Index>(first,last));
+    }
+    return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction);
+    /* TBB backend: delegates to tbb::parallel_reduce over a blocked_range with grain size minStepSize */
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+      [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+      reduction,context);
+    //if (context.is_group_execution_cancelled())
+    //  throw std::runtime_error("task cancelled");
+    return v;
+  #else
+    const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+      [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+      reduction);
+    //if (tbb::task::self().is_cancelled())
+    //  throw std::runtime_error("task cancelled");
+    return v;
+  #endif
+#else // TASKING_PPL
+    struct AlignedValue // wrapper that stores a Value at an aligned address inside an over-sized char buffer
+    {
+      char storage[__alignof(Value)+sizeof(Value)]; // alignof(Value) extra bytes leave room to align the object within the buffer
+      static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); }; // rounds p up to a multiple of a; presumably relies on a being a power of two
+      Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
+      const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
+      AlignedValue(const Value& v) { new(getValuePtr()) Value(v); }
+      AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); }
+      AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); }; // copies; the source is const and cannot be moved from
+      AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
+      AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
+      operator Value() const { return *getValuePtr(); }
+    };
+    /* minimal iterator over plain Index values, so that concurrency::parallel_reduce can be driven by an index range */
+    struct Iterator_Index
+    {
+      Index v;
+      typedef std::forward_iterator_tag iterator_category;
+      typedef AlignedValue value_type;
+      typedef Index difference_type;
+      typedef Index distance_type;
+      typedef AlignedValue* pointer;
+      typedef AlignedValue& reference;
+      __forceinline Iterator_Index() {}
+      __forceinline Iterator_Index(Index v) : v(v) {}
+      __forceinline bool operator== (Iterator_Index other) { return v == other.v; }
+      __forceinline bool operator!= (Iterator_Index other) { return v != other.v; }
+      __forceinline Iterator_Index operator++() { return Iterator_Index(++v); }
+      __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); }
+    };
+    /* per-chunk body: evaluates func on [begin.v,end.v) and combines the result with the incoming start value */
+    auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) {
+      assert(begin.v < end.v);
+      return reduction(start, func(range<Index>(begin.v, end.v)));
+    };
+    const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction);
+    return v;
+#endif
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
+  { /* variant with an explicit threshold: ranges shorter than parallel_threshold are evaluated by a single direct call to func */
+    if (likely(last-first < parallel_threshold)) {
+      return func(range<Index>(first,last)); 
+    } else {
+      return parallel_reduce(first,last,minStepSize,identity,func,reduction);
+    }
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) 
+  { /* range-object convenience overload: unpacks begin()/end() and forwards to the first/last variant with threshold */
+    return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction);
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
+  { /* per-index variant: here func takes a single index i, unlike the overloads where func takes a whole range */
+    auto funcr = [&] ( const range<Index> r ) { // adapts the per-index func to the per-range interface
+      Value v = identity;
+      for (Index i=r.begin(); i<r.end(); i++)
+        v = reduction(v,func(i));
+      return v;
+    };
+    return parallel_reduce(first,last,Index(1),identity,funcr,reduction); // minimum step size of one index
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction )
+  { /* range-object overload: forwards begin()/end() with a minimum step size of Index(1); func is passed on unchanged */
+    return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction);
+  }
+}
--- a/thirdparty/embree/common/algorithms/parallel_set.h
+++ b/thirdparty/embree/common/algorithms/parallel_set.h
@@ -0,0 +1,52 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_sort.h"
+
+namespace embree
+{
+  /* implementation of a set of values with parallel construction; elements are kept in one sorted vector and queried by binary search */
+  template<typename T>
+  class parallel_set
+  {
+  public:
+
+    /*! default constructor for the parallel set, creates an empty set */
+    parallel_set () {}
+
+    /*! construction from vector, see init() */
+    template<typename Vector>
+      parallel_set (const Vector& in) { init(in); }
+
+    /*! initializes the parallel set from a vector; previous content is replaced */
+    template<typename Vector>
+      void init(const Vector& in) 
+    {
+      /* copy data to internal vector in parallel, in blocks of at least 4*4096 elements */
+      vec.resize(in.size());
+      parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) {
+	for (size_t i=r.begin(); i<r.end(); i++) 
+	  vec[i] = in[i];
+      });
+
+      /* sort the data; temp is scratch storage of the same size for the radix sort */
+      std::vector<T> temp(in.size());
+      radix_sort<T>(vec.data(),temp.data(),vec.size());
+    }
+
+    /*! tests if some element is in the set */
+    __forceinline bool lookup(const T& elt) const {
+      return std::binary_search(vec.begin(), vec.end(), elt);
+    }
+
+    /*! clears all state */
+    void clear() {
+      vec.clear();
+    }
+
+  private:
+    std::vector<T> vec;   //!< vector containing sorted elements
+  };
+}
--- a/thirdparty/embree/common/algorithms/parallel_sort.h
+++ b/thirdparty/embree/common/algorithms/parallel_sort.h
@@ -0,0 +1,454 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../simd/simd.h"
+#include "parallel_for.h"
+#include <algorithm>
+
+namespace embree
+{
+  template<class T>
+    __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
+  {
+    /* array[0,next) is sorted at the start of each iteration; array[next] is inserted into it */
+    for (size_t next = 1; next < length; ++next)
+    {
+      T pending = array[next];
+      size_t slot = next;
+
+      /* shift larger elements one position up until the slot for pending is found */
+      for (; slot > 0 && pending < array[slot-1]; --slot)
+        array[slot] = array[slot-1];
+      array[slot] = pending;
+    }
+  }
+  
+  template<class T>
+    __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
+  {
+    /* array[0,next) is sorted in descending order at the start of each iteration; array[next] is inserted into it */
+    for (size_t next = 1; next < length; ++next)
+    {
+      T pending = array[next];
+      size_t slot = next;
+
+      /* shift smaller elements one position up until the slot for pending is found */
+      for (; slot > 0 && pending > array[slot-1]; --slot)
+        array[slot] = array[slot-1];
+      array[slot] = pending;
+    }
+  }
+  
+  template<class T> 
+    void quicksort_ascending(T *__restrict__ t, 
+			     const ssize_t begin, 
+			     const ssize_t end)
+  { /* recursive quicksort of t[begin..end] (both bounds inclusive) into ascending order; ranges with fewer than two elements are left untouched */
+    if (likely(begin < end)) 
+    {      
+      const T pivotvalue = t[begin];
+      ssize_t left  = begin - 1;
+      ssize_t right = end   + 1;
+      /* Hoare-style partition around the value of the first element */
+      while(1) 
+      {
+        while (t[--right] > pivotvalue);
+        while (t[++left] < pivotvalue);
+        /* scans met or crossed: t[begin..right] <= pivotvalue <= t[right+1..end] */
+        if (left >= right) break;
+        /* both elements are on the wrong side, exchange them */
+        const T temp = t[right];
+        t[right] = t[left];
+        t[left] = temp;
+      }
+      /* keep the split index in ssize_t: narrowing it to int would corrupt the recursion bounds for ranges beyond INT_MAX */
+      const ssize_t pivot = right;
+      quicksort_ascending(t, begin, pivot);
+      quicksort_ascending(t, pivot + 1, end);
+    }
+  }
+  
+  template<class T> 
+    void quicksort_decending(T *__restrict__ t, 
+			     const ssize_t begin, 
+			     const ssize_t end)
+  { /* recursive quicksort of t[begin..end] (both bounds inclusive) into descending order; ranges with fewer than two elements are left untouched */
+    if (likely(begin < end)) 
+    {
+      const T pivotvalue = t[begin];
+      ssize_t left  = begin - 1;
+      ssize_t right = end   + 1;
+      /* Hoare-style partition around the value of the first element */
+      while(1) 
+      {
+        while (t[--right] < pivotvalue);
+        while (t[++left] > pivotvalue);
+        /* scans met or crossed: t[begin..right] >= pivotvalue >= t[right+1..end] */
+        if (left >= right) break;
+        /* both elements are on the wrong side, exchange them */
+        const T temp = t[right];
+        t[right] = t[left];
+        t[left] = temp;
+      }
+      /* keep the split index in ssize_t: narrowing it to int would corrupt the recursion bounds for ranges beyond INT_MAX */
+      const ssize_t pivot = right;
+      quicksort_decending(t, begin, pivot);
+      quicksort_decending(t, pivot + 1, end);
+    }
+  }
+  
+  
+  template<class T, ssize_t THRESHOLD> 
+    void quicksort_insertionsort_ascending(T *__restrict__ t, 
+					   const ssize_t begin, 
+					   const ssize_t end)
+  { /* ascending quicksort of t[begin..end] (inclusive bounds) that hands ranges of at most THRESHOLD elements to insertion sort */
+    if (likely(begin < end)) 
+    {      
+      const ssize_t size = end-begin+1;
+      if (likely(size <= THRESHOLD))
+      {
+        insertionsort_ascending<T>(&t[begin],size);
+      }
+      else
+      {
+        const T pivotvalue = t[begin];
+        ssize_t left  = begin - 1;
+        ssize_t right = end   + 1;
+        /* partition around the value of the first element, scanning from both ends */
+        while(1) 
+        {
+          while (t[--right] > pivotvalue);
+          while (t[++left] < pivotvalue);
+          /* stop once the two scans have met or crossed */
+          if (left >= right) break;
+          /* both elements are on the wrong side, exchange them */
+          const T temp = t[right];
+          t[right] = t[left];
+          t[left] = temp;
+        }
+        /* right is the last index of the lower part; recurse into both parts */
+        const ssize_t pivot = right;
+        quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
+        quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
+      }
+    }
+  }
+  
+  
+  template<class T, ssize_t THRESHOLD> 
+    void quicksort_insertionsort_decending(T *__restrict__ t, 
+					   const ssize_t begin, 
+					   const ssize_t end)
+  { /* descending quicksort of t[begin..end] (inclusive bounds) that hands ranges of at most THRESHOLD elements to insertion sort */
+    if (likely(begin < end)) 
+    {
+      const ssize_t size = end-begin+1;
+      if (likely(size <= THRESHOLD))
+      {
+        insertionsort_decending<T>(&t[begin],size);
+      }
+      else
+      {
+        /* partition around the value of the first element, scanning from both ends */
+        const T pivotvalue = t[begin];
+        ssize_t left  = begin - 1;
+        ssize_t right = end   + 1;
+        
+        while(1) 
+        {
+          while (t[--right] < pivotvalue);
+          while (t[++left] > pivotvalue);
+          /* stop once the two scans have met or crossed */
+          if (left >= right) break;
+          /* both elements are on the wrong side, exchange them */
+          const T temp = t[right];
+          t[right] = t[left];
+          t[left] = temp;
+        }
+        /* right is the last index of the upper-valued part; recurse into both parts */
+        const ssize_t pivot = right;
+        quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot);
+        quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end);
+      }
+    }
+  }
+  
+  template<typename T>
+    static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
+  { /* in-place radix sort on the 8-bit digit of unsigned(morton[i]) at bit position shift, recursing on lower digits per bucket */
+    static const unsigned int BITS = 8;
+    static const unsigned int BUCKETS = (1 << BITS);
+    static const unsigned int CMP_SORT_THRESHOLD = 16; // buckets smaller than this are finished with insertion sort
+    
+    __aligned(64) unsigned int count[BUCKETS];
+    
+    /* clear buckets */
+    for (size_t i=0;i<BUCKETS;i++) count[i] = 0;
+    
+    /* count buckets */
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+    for (size_t i=0;i<num;i++)
+      count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;
+    
+    /* prefix sums: head[i] is the next free slot of bucket i, tail[i] is one past its last slot */
+    __aligned(64) unsigned int head[BUCKETS];
+    __aligned(64) unsigned int tail[BUCKETS];
+    
+    head[0] = 0;
+    for (size_t i=1; i<BUCKETS; i++)    
+      head[i] = head[i-1] + count[i-1];
+    
+    for (size_t i=0; i<BUCKETS-1; i++)    
+      tail[i] = head[i+1];
+    
+    tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];
+    
+    assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);      
+    assert(tail[BUCKETS-1] == num);      
+    
+    /* in-place swap: the element in hand is pushed into its destination bucket and the displaced element is taken up, until one for bucket i appears */
+    for (size_t i=0;i<BUCKETS;i++)
+    {
+      /* process bucket */
+      while(head[i] < tail[i])
+      {
+        T v = morton[head[i]];
+        while(1)
+        {
+          const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
+          if (b == i) break;
+          std::swap(v,morton[head[b]++]);
+        }
+        assert((unsigned(v) >> shift & (BUCKETS-1)) == i);
+        morton[head[i]++] = v;
+      }
+    }
+    if (shift == 0) return; // lowest digit handled, nothing left to refine
+    /* sort the content of every non-empty bucket by the remaining lower bits */
+    size_t offset = 0;
+    for (size_t i=0;i<BUCKETS;i++)
+      if (count[i])
+      {
+        /* debug check: elements of this bucket carry digit i */
+        for (size_t j=offset;j<offset+count[i]-1;j++)
+          assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);
+        
+        if (unlikely(count[i] < CMP_SORT_THRESHOLD))
+          insertionsort_ascending(morton + offset, count[i]);
+        else
+          radixsort32(morton + offset, count[i], shift-BITS);
+        /* debug check: the bucket is now in ascending order */
+        for (size_t j=offset;j<offset+count[i]-1;j++)
+          assert(morton[j] <= morton[j+1]);
+        
+        offset += count[i];
+      }      
+  }    
+
+  /*! Least-significant-digit radix sort of N elements of type Ty, ordered by
+   *  the value obtained from converting an element to Key.
+   *
+   *  Small inputs (N <= blockSize) are sorted in place with std::sort. Larger
+   *  inputs are sorted with one counting/scatter pass per 8-bit digit of Key,
+   *  each pass being split over several tasks. The passes alternate between
+   *  src and tmp; the last pass writes to src, so the sorted result always
+   *  ends up in src. tmp must provide room for N elements.
+   *
+   *  Only 4-byte and 8-byte Key types are handled by the parallel path; for
+   *  any other sizeof(Key) tbbRadixSort performs no pass at all. */
+  template<typename Ty, typename Key>
+    class ParallelRadixSort
+  {
+    static const size_t MAX_TASKS = 64;        //!< upper bound for the number of tasks, also the number of histograms allocated
+    static const size_t BITS = 8;              //!< bits per radix digit
+    static const size_t BUCKETS = (1 << BITS); //!< number of buckets per digit
+    typedef unsigned int TyRadixCount[BUCKETS]; //!< histogram of one task
+    
+    /*! ordering used by the single threaded path: compares the Key conversions */
+    template<typename T>
+      static bool compare(const T& v0, const T& v1) {
+      return (Key)v0 < (Key)v1;
+    }
+
+  private:
+    ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
+    ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement
+
+    
+  public:
+    /*! \param src array to sort, receives the sorted result
+     *  \param tmp scratch array written by the parallel path
+     *  \param N   number of elements in src */
+    ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
+      : radixCount(nullptr), src(src), tmp(tmp), N(N) {}
+
+    /*! Sorts src. blockSize must be > 0; it is the largest N that is still
+     *  sorted single threaded and the work size used to derive the number
+     *  of tasks. */
+    void sort(const size_t blockSize)
+    {
+      assert(blockSize > 0);
+      
+      /* perform single threaded sort for small N */
+      if (N<=blockSize) // handles also special case of 0!
+      {	  
+        /* do inplace sort inside destination array */
+        std::sort(src,src+N,compare<Ty>);
+      }
+      
+      /* perform parallel sort for large N */
+      else 
+      {
+        /* one task per block, limited by the thread count and by MAX_TASKS */
+        const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
+        tbbRadixSort(numThreads);
+      }
+    }
+
+    /*! releases the histogram storage (if the parallel path allocated any) */
+    ~ParallelRadixSort()
+    {
+      alignedFree(radixCount); 
+      radixCount = nullptr;
+    }
+    
+  private:
+    
+    /*! Counting phase of one pass: task threadIndex builds the histogram of
+     *  the digit at bit `shift` over its slice of src and stores it in
+     *  radixCount[threadIndex]. dst is not touched by this phase. */
+    void tbbRadixIteration0(const Key shift, 
+                            const Ty* __restrict const src, 
+                            Ty* __restrict const dst, 
+                            const size_t threadIndex, const size_t threadCount)
+    {
+      /* slice [startID,endID) of the input handled by this task */
+      const size_t startID = (threadIndex+0)*N/threadCount;
+      const size_t endID   = (threadIndex+1)*N/threadCount;
+      
+      /* mask to extract some number of bits */
+      const Key mask = BUCKETS-1;
+      
+      /* count how many items go into the buckets */
+      for (size_t i=0; i<BUCKETS; i++)
+        radixCount[threadIndex][i] = 0;
+
+      /* iterate over src array and count buckets */
+      unsigned int * __restrict const count = radixCount[threadIndex];
+#if defined(__INTEL_COMPILER)
+#pragma nounroll      
+#endif
+      for (size_t i=startID; i<endID; i++) {
+#if defined(__64BIT__)
+        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
+#else
+        const Key index = ((Key)src[i] >> shift) & mask;
+#endif
+        count[index]++;
+      }
+    }
+    
+    /*! Scatter phase of one pass: task threadIndex copies its slice of src
+     *  into dst. The write position of bucket b starts behind all elements of
+     *  lower buckets (over all tasks) and behind the elements of bucket b
+     *  counted by tasks with a lower index, so every task writes to its own
+     *  range. Reads the histograms of all tasks written by tbbRadixIteration0. */
+    void tbbRadixIteration1(const Key shift, 
+                            const Ty* __restrict const src, 
+                            Ty* __restrict const dst, 
+                            const size_t threadIndex, const size_t threadCount)
+    {
+      /* slice [startID,endID) of the input handled by this task */
+      const size_t startID = (threadIndex+0)*N/threadCount;
+      const size_t endID   = (threadIndex+1)*N/threadCount;
+      
+      /* mask to extract some number of bits */
+      const Key mask = BUCKETS-1;
+      
+      /* calculate total number of items for each bucket */
+      __aligned(64) unsigned int total[BUCKETS];
+      /*
+      for (size_t i=0; i<BUCKETS; i++)
+        total[i] = 0;
+      */
+      /* vectorized form of the loop above, VSIZEX entries per step */
+      for (size_t i=0; i<BUCKETS; i+=VSIZEX)
+        vintx::store(&total[i], zero);
+      
+      for (size_t i=0; i<threadCount; i++)
+      {
+        /*
+        for (size_t j=0; j<BUCKETS; j++)
+          total[j] += radixCount[i][j];
+        */
+        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
+          vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
+      }
+      
+      /* calculate start offset of each bucket */
+      __aligned(64) unsigned int offset[BUCKETS];
+      offset[0] = 0;
+      for (size_t i=1; i<BUCKETS; i++)    
+        offset[i] = offset[i-1] + total[i-1];
+      
+      /* calculate start offset of each bucket for this thread */
+      for (size_t i=0; i<threadIndex; i++)
+      {
+        /*
+        for (size_t j=0; j<BUCKETS; j++)
+          offset[j] += radixCount[i][j];
+        */
+        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
+          vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
+      }
+      
+      /* copy items into their buckets */
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+      for (size_t i=startID; i<endID; i++) {
+        const Ty elt = src[i];
+#if defined(__64BIT__)
+        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
+#else
+        const size_t index = ((Key)src[i] >> shift) & mask;
+#endif
+        dst[offset[index]++] = elt;
+      }
+    }
+    
+    /*! One radix pass from src to dst on the digit at bit `shift`: counting
+     *  phase followed by scatter phase, each run as numTasks tasks. The
+     *  `last` flag is not used by this implementation. */
+    void tbbRadixIteration(const Key shift, const bool last,
+                           const Ty* __restrict src, Ty* __restrict dst,
+                           const size_t numTasks)
+    {
+      affinity_partitioner ap;
+      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
+      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
+    }
+    
+    /*! Runs all passes of the parallel sort, lowest digit first, alternating
+     *  between src and tmp so that the final pass writes into src. */
+    void tbbRadixSort(const size_t numTasks)
+    {
+      /* allocate the per-task histograms only once: allocating on every call
+         would leak the previous buffer when sort() is invoked repeatedly */
+      if (radixCount == nullptr)
+        radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);
+      
+      if (sizeof(Key) == sizeof(uint32_t)) {
+        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
+      }
+      else if (sizeof(Key) == sizeof(uint64_t))
+      {
+        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
+      }
+    }
+    
+  private:
+    TyRadixCount* radixCount; //!< one histogram per task, allocated by tbbRadixSort
+    Ty* const src;            //!< array to sort, holds the result
+    Ty* const tmp;            //!< scratch array of the parallel path
+    const size_t N;           //!< number of elements
+  };
+
+  /*! Sorts src[0..N) using the elements themselves as sort key (Key == Ty).
+   *  tmp and blockSize are handed to ParallelRadixSort unchanged. */
+  template<typename Ty>
+    void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
+  {
+    ParallelRadixSort<Ty,Ty> sorter(src,tmp,N);
+    sorter.sort(blockSize);
+  }
+  
+  /*! Sorts src[0..N) by the value obtained from converting each element to
+   *  Key. tmp and blockSize are handed to ParallelRadixSort unchanged. */
+  template<typename Ty, typename Key>
+    void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
+  {
+    typedef ParallelRadixSort<Ty,Key> Sorter;
+    Sorter sorter(src,tmp,N);
+    sorter.sort(blockSize);
+  }
+  
+  /*! Sorts src[0..N) by the uint32_t key each element converts to. */
+  template<typename Ty>
+    void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
+  {
+    ParallelRadixSort<Ty,uint32_t>(src,tmp,N).sort(blockSize);
+  }
+  
+  /*! Sorts src[0..N) by the uint64_t key each element converts to. */
+  template<typename Ty>
+    void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
+  {
+    ParallelRadixSort<Ty,uint64_t>(src,tmp,N).sort(blockSize);
+  }
+}
--- a/thirdparty/embree/common/lexers/parsestream.h
+++ b/thirdparty/embree/common/lexers/parsestream.h
@@ -0,0 +1,101 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stringstream.h"
+#include "../sys/filename.h"
+#include "../math/vec2.h"
+#include "../math/vec3.h"
+#include "../math/col3.h"
+#include "../math/color.h"
+
+namespace embree
+{
+  /*! Helper for simple command line style parsing: reads string tokens from
+   *  an underlying token stream and converts them on request to numbers,
+   *  vectors, colors or file names. Numeric conversion uses atoi/atof, so a
+   *  token that is not a number is not reported as an error here. */
+  class ParseStream : public Stream<std::string>
+  {
+  public:
+    /*! reads tokens from an existing string token stream */
+    ParseStream (const Ref<Stream<std::string> >& cin) : cin(cin) {}
+
+    /*! tokenizes a character stream through a StringStream created with the
+     *  given separators, end-of-line token and multi-line flag */
+    ParseStream (const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ",
+                 const std::string& endl = "", bool multiLine = false)
+      : cin(new StringStream(cin,seps,endl,multiLine)) {}
+
+  public:
+    /*! location of the next token of the underlying stream */
+    ParseLocation location() { return cin->loc(); }
+
+    /*! fetches the next token from the underlying stream */
+    std::string next() { return cin->get(); }
+
+    /*! consumes one token and raises a runtime error if it differs from `next` */
+    void force(const std::string& next) {
+      const std::string found = get();
+      if (found == next) return;
+      THROW_RUNTIME_ERROR("token \""+next+"\" expected but token \""+found+"\" found");
+    }
+
+    /*! consumes one token and returns it unchanged */
+    std::string getString() {
+      return get();
+    }
+
+    /*! consumes one token and wraps it into a FileName */
+    FileName getFileName()  {
+      const std::string path = get();
+      return FileName(path);
+    }
+
+    /*! consumes one token and converts it with atoi */
+    int   getInt  () {
+      const std::string text = get();
+      return atoi(text.c_str());
+    }
+
+    /*! consumes two tokens (x, then y) as integers */
+    Vec2i getVec2i() {
+      const int x = getInt();
+      const int y = getInt();
+      return Vec2i(x,y);
+    }
+
+    /*! consumes three tokens (x, y, z in that order) as integers */
+    Vec3ia getVec3ia() {
+      const int x = getInt();
+      const int y = getInt();
+      const int z = getInt();
+      return Vec3ia(x,y,z);
+    }
+
+    /*! consumes one token and converts it with atof */
+    float getFloat() {
+      const std::string text = get();
+      return (float)atof(text.c_str());
+    }
+
+    /*! consumes two tokens (x, then y) as floats */
+    Vec2f getVec2f() {
+      const float x = getFloat();
+      const float y = getFloat();
+      return Vec2f(x,y);
+    }
+
+    /*! consumes three tokens (x, y, z in that order) as floats */
+    Vec3f getVec3f() {
+      const float x = getFloat();
+      const float y = getFloat();
+      const float z = getFloat();
+      return Vec3f(x,y,z);
+    }
+
+    /*! consumes three tokens (x, y, z in that order) as floats */
+    Vec3fa getVec3fa() {
+      const float x = getFloat();
+      const float y = getFloat();
+      const float z = getFloat();
+      return Vec3fa(x,y,z);
+    }
+
+    /*! consumes three tokens (x, y, z in that order) as floats */
+    Col3f getCol3f() {
+      const float x = getFloat();
+      const float y = getFloat();
+      const float z = getFloat();
+      return Col3f(x,y,z);
+    }
+
+    /*! consumes three tokens (r, g, b in that order) as floats */
+    Color getColor() {
+      const float r = getFloat();
+      const float g = getFloat();
+      const float b = getFloat();
+      return Color(r,g,b);
+    }
+
+  private:
+    Ref<Stream<std::string> > cin; //!< underlying token stream
+  };
+}
--- a/thirdparty/embree/common/lexers/stream.h
+++ b/thirdparty/embree/common/lexers/stream.h
@@ -0,0 +1,215 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/ref.h"
+#include "../sys/filename.h"
+#include "../sys/estring.h"
+
+#include <vector>
+#include <iostream>
+#include <cstdio>
+#include <string.h>
+
+namespace embree
+{
+  /*! Position of a stream element in its source: optional file (or stream)
+   *  name plus line and column number. Negative numbers mean "not known". */
+  class ParseLocation
+  {
+  public:
+    /*! creates an unknown location (no name, line and column set to -1) */
+    ParseLocation () : lineNumber(-1), colNumber(-1) {}
+
+    /*! creates a location; the fourth argument (character number) is accepted but not stored */
+    ParseLocation (std::shared_ptr<std::string> fileName, ssize_t lineNumber, ssize_t colNumber, ssize_t /*charNumber*/)
+      : fileName(fileName), lineNumber(lineNumber), colNumber(colNumber) {}
+
+    /*! Formats the location as "<name> line <l> character <c>". The name is
+     *  "unknown" when no file name is set; the line part is omitted for a
+     *  negative line number, and the character part is only printed when
+     *  both line and column are non-negative. */
+    std::string str() const
+    {
+      std::string text = fileName ? *fileName : std::string("unknown");
+      if (lineNumber < 0) return text;
+      text += " line " + toString(lineNumber);
+      if (colNumber >= 0) text += " character " + toString(colNumber);
+      return text;
+    }
+
+  private:
+    std::shared_ptr<std::string> fileName;         /// name of the file (or stream) the token is from
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+  };
+
+  /*! A stream class templated over the stream elements.
+   *
+   *  Derived classes supply the elements through next() and their positions
+   *  through location(). The base class keeps the most recent elements in a
+   *  ring buffer of BUF_SIZE entries, which allows peeking at the next
+   *  element and pushing consumed elements back with unget(). */
+  template<typename T> class Stream : public RefCount
+  {
+    enum { BUF_SIZE = 1024 }; // capacity of the ring buffer
+    
+  private:
+    /*! produces the next element of the source */
+    virtual T next() = 0;
+    /*! reports the location of the element next() is about to produce */
+    virtual ParseLocation location() = 0;
+    /*! reads one element together with its location (location is queried first) */
+    __forceinline std::pair<T,ParseLocation> nextHelper() {
+      ParseLocation l = location();
+      T v = next();
+      return std::pair<T,ParseLocation>(v,l);
+    }
+    /*! appends an element behind the buffered ones; when the buffer is full
+     *  the oldest consumed element is dropped to make room */
+    __forceinline void push_back(const std::pair<T,ParseLocation>& v) {
+      if (past+future == BUF_SIZE) pop_front();
+      size_t end = (start+past+future++)%BUF_SIZE;
+      buffer[end] = v;
+    }
+    /*! discards the oldest consumed element; raises a runtime error if there is none */
+    __forceinline void pop_front() {
+      if (past == 0) THROW_RUNTIME_ERROR("stream buffer empty");
+      start = (start+1)%BUF_SIZE; past--;
+    }
+  public:
+    Stream () : start(0), past(0), future(0), buffer(BUF_SIZE) {}
+    virtual ~Stream() {}
+    
+  public:
+    
+    /*! location of the next unconsumed element (reads it from the source if nothing is buffered) */
+    const ParseLocation& loc() {
+      if (future == 0) push_back(nextHelper());
+      return buffer[(start+past)%BUF_SIZE].second;
+    }
+    /*! consumes the next element and returns a copy of it */
+    T get() {
+      if (future == 0) push_back(nextHelper());
+      T t = buffer[(start+past)%BUF_SIZE].first;
+      past++; future--;
+      return t;
+    }
+    /*! returns the next element without consuming it */
+    const T& peek() {
+      if (future == 0) push_back(nextHelper());
+      return buffer[(start+past)%BUF_SIZE].first;
+    }
+    /*! makes the last n consumed elements available again and returns the
+     *  element that is now next; raises a runtime error if fewer than n
+     *  consumed elements are still buffered */
+    const T& unget(size_t n = 1) {
+      if (past < n) THROW_RUNTIME_ERROR ("cannot unget that many items");
+      past -= n; future += n;
+      return peek();
+    }
+    /*! consumes the next element without returning it */
+    void drop() {
+      if (future == 0) push_back(nextHelper());
+      past++; future--;
+    }
+  private:
+    /* start:  buffer index of the oldest element still kept
+       past:   number of kept elements that were already consumed
+       future: number of buffered elements that are not consumed yet */
+    size_t start,past,future;
+    std::vector<std::pair<T,ParseLocation> > buffer;
+  };
+  
+  /*! wraps a std::istream as a character stream and tracks the line, column
+   *  and character position of the characters read */
+  class StdStream : public Stream<int>
+  {
+  public:
+    /*! \param cin  stream to read from; only a reference is stored
+     *  \param name name reported in parse locations */
+    StdStream (std::istream& cin, const std::string& name = "std::stream")
+      : cin(cin), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {}
+    ~StdStream() {}
+
+    /*! position of the character that next() will read */
+    ParseLocation location() {
+      return ParseLocation(name,lineNumber,colNumber,charNumber);
+    }
+
+    /*! reads one character (or EOF) and advances the position counters */
+    int next() {
+      const int c = cin.get();
+      switch (c) {
+      case '\n': lineNumber++; colNumber = 0; break; // new line starts at column 0
+      case '\r': break;                              // carriage return does not advance the column
+      default  : colNumber++; break;
+      }
+      charNumber++;
+      return c;
+    }
+  private:
+    std::istream& cin;
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+    ssize_t charNumber;           /// the character in the file
+    std::shared_ptr<std::string> name;             /// name of buffer
+  };
+
+  /*! character stream that reads from a file and tracks the line, column and
+   *  character position of the characters read */
+  class FileStream : public Stream<int>
+  {
+  public:
+    /*! opens the file; raises a runtime error if it cannot be opened */
+    FileStream (const FileName& fileName)
+      : lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(fileName.str())))
+    {
+      /* ifs is default constructed and therefore not open; calling close()
+         on it first would only put the stream into the failed state */
+      ifs.open(fileName.str());
+      if (!ifs.is_open()) THROW_RUNTIME_ERROR("cannot open file " + fileName.str());
+    }
+    ~FileStream() { 
+      if (ifs.is_open()) ifs.close();
+    }
+
+  public:
+    /*! position of the character that next() will read */
+    ParseLocation location() {
+      return ParseLocation(name,lineNumber,colNumber,charNumber);
+    }
+
+    /*! reads one character (or EOF) and advances the position counters;
+     *  '\n' starts a new line, '\r' does not advance the column */
+    int next() {
+      int c = ifs.get();
+      if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
+      charNumber++;
+      return c;
+    }
+
+
+  private:
+    std::ifstream ifs;
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+    ssize_t charNumber;           /// the character in the file
+    std::shared_ptr<std::string> name;             /// name of buffer
+  };
+
+  /*! character stream over a zero terminated C string; the string is not
+   *  copied, only the pointer is stored */
+  class StrStream : public Stream<int>
+  {
+  public:
+
+    StrStream (const char* str)
+      : str(str), lineNumber(1), colNumber(0), charNumber(0) {}
+
+  public:
+    /*! position of the character that next() will read (no file name) */
+    ParseLocation location() {
+      return ParseLocation(std::shared_ptr<std::string>(),lineNumber,colNumber,charNumber);
+    }
+
+    /*! Returns the next character as a value in 0..255, or EOF at the
+     *  terminating zero (the position is not advanced past the terminator). */
+    int next() {
+      /* go through unsigned char: with a signed char type bytes >= 0x80
+         would become negative, and 0xFF would be mistaken for EOF */
+      const int c = (unsigned char)str[charNumber];
+      if (c == 0) return EOF;
+      if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
+      charNumber++;
+      return c;
+    }
+
+  private:
+    const char* str;
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+    ssize_t charNumber;           /// the character in the file
+  };
+
+  /*! Character stream over the command line arguments argv[1..argc-1]. The
+   *  arguments are delivered one after the other, each followed by a single
+   *  space; EOF follows the space of the last argument. */
+  class CommandLineStream : public Stream<int>
+  {
+  public:
+    /*! Copies the arguments. The character counter starts behind the program
+     *  name: length of argv[0] (at most 1024 characters are counted) plus one. */
+    CommandLineStream (int argc, char** argv, const std::string& name = "command line")
+      : i(0), j(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name)))
+    {
+      if (argc > 0) {
+        for (size_t k=0; k<1024 && argv[0][k]; k++) charNumber++;
+        charNumber++;
+      }
+      for (ssize_t k=1; k<argc; k++) args.push_back(argv[k]);
+    }
+    ~CommandLineStream() {}
+  public:
+    /*! location of the next character: line 0, column = character counter */
+    ParseLocation location() {
+      return ParseLocation(name,0,charNumber,charNumber);
+    }
+    /*! returns the next character as a value in 0..255, a space at the end
+     *  of each argument, or EOF when all arguments are consumed */
+    int next() {
+      if (i == args.size()) return EOF;
+      if (j == args[i].size()) { i++; j=0; charNumber++; return ' '; }
+      charNumber++;
+      /* go through unsigned char: with a signed char type bytes >= 0x80
+         would become negative, and 0xFF would be mistaken for EOF */
+      return (unsigned char)args[i][j++];
+    }
+  private:
+    size_t i,j;                   /// index of the current argument and of the next character in it
+    std::vector<std::string> args;
+    ssize_t charNumber;           /// the character in the file
+    std::shared_ptr<std::string> name;             /// name of buffer
+  };
+}
--- a/thirdparty/embree/common/lexers/streamfilters.h
+++ b/thirdparty/embree/common/lexers/streamfilters.h
@@ -0,0 +1,39 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+
+namespace embree
+{
+  /* Removes all line comments from a stream: whenever the input continues
+     with the lineComment marker, the marker and everything up to (but not
+     including) the next newline or EOF is skipped. */
+  class LineCommentFilter : public Stream<int>
+  {
+  public:
+    /* filters the characters of a file */
+    LineCommentFilter (const FileName& fileName, const std::string& lineComment)
+      : cin(new FileStream(fileName)), lineComment(lineComment) {}
+    /* filters the characters of an existing stream */
+    LineCommentFilter (Ref<Stream<int> > cin, const std::string& lineComment)
+      : cin(cin), lineComment(lineComment) {}
+
+    ParseLocation location() { return cin->loc(); }
+
+    int next()
+    {
+      /* consume input for as long as it agrees with the comment marker */
+      size_t matched = 0;
+      while (matched < lineComment.size() && cin->peek() == lineComment[matched]) {
+        cin->get();
+        matched++;
+      }
+
+      if (matched == lineComment.size()) {
+        /* whole marker found: eat all characters until the end of the line (or file) */
+        while (cin->peek() != '\n' && cin->peek() != EOF) cin->get();
+      }
+      else {
+        /* only a prefix of the marker matched: give those characters back */
+        cin->unget(matched);
+      }
+      return cin->get();
+    }
+
+  private:
+    Ref<Stream<int> > cin;
+    std::string lineComment;
+  };
+}
--- a/thirdparty/embree/common/lexers/stringstream.cpp
+++ b/thirdparty/embree/common/lexers/stringstream.cpp
@@ -0,0 +1,48 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "stringstream.h"
+
+namespace embree
+{
+  /* characters accepted inside a token; StringStream::next aborts on any other non-separator character */
+  static const std::string stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\";
+  
+  /* Builds a lookup table for fast categorization of characters: afterwards
+     map[c] is true exactly for the byte values c that occur in chrs. */
+  static void createCharMap(bool map[256], const std::string& chrs) {
+    for (bool* entry = map; entry != map+256; ++entry)
+      *entry = false;
+    for (const char ch : chrs)
+      map[uint8_t(ch)] = true;
+  }
+
+  /* Simple tokenizer: stores the source stream and the end-of-line/multi-line
+     settings and precomputes the lookup tables for separators (from seps)
+     and for valid token characters (from stringChars). */
+  StringStream::StringStream(const Ref<Stream<int> >& cin, const std::string& seps, const std::string& endl, bool multiLine)
+    : cin(cin),
+      endl(endl),
+      multiLine(multiLine)
+  {
+    createCharMap(isValidCharMap,stringChars);
+    createCharMap(isSepMap,seps);
+  }
+
+  /* Returns the next token: leading separators are skipped, then all
+     characters up to the next separator or EOF are collected. If an
+     end-of-line token is configured, a newline met while skipping is
+     returned as that token. In multi-line mode a backslash directly followed
+     by a newline is skipped like a separator. Aborts on a character that is
+     not in the set of valid characters. */
+  std::string StringStream::next()
+  {
+    /* skip separators */
+    for (;;)
+    {
+      const int c = cin->peek();
+      if (c == EOF) break;
+
+      /* report the end of the line as its own token if requested */
+      if (c == '\n' && !endl.empty()) { cin->drop(); return endl; }
+
+      if (multiLine && c == '\\') {
+        cin->drop();
+        const bool lineContinuation = (cin->peek() == '\n');
+        if (lineContinuation) { cin->drop(); continue; }
+        cin->unget(); /* lone backslash: put it back, it is the next character again */
+      }
+
+      if (!isSeparator(c)) break;
+      cin->drop();
+    }
+
+    /* parse everything until the next separator */
+    std::string token; token.reserve(64);
+    for (;;)
+    {
+      const int ahead = cin->peek();
+      if (ahead == EOF || isSeparator(ahead)) break;
+      const int c = cin->get();
+      if (!isValidChar(c)) abort(); //throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
+      token.push_back((char)c);
+    }
+    return token;
+  }
+}
--- a/thirdparty/embree/common/lexers/stringstream.h
+++ b/thirdparty/embree/common/lexers/stringstream.h
@@ -0,0 +1,29 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+
+namespace embree
+{
+  /*! Simple tokenizer that produces a string stream: turns a character
+   *  stream into a stream of string tokens delimited by separator characters. */
+  class StringStream : public Stream<std::string>
+  {
+  public:
+    /*! \param cin       character stream to tokenize
+     *  \param seps      characters that act as separators
+     *  \param endl      token returned for a newline; an empty string disables this
+     *  \param multiLine whether to parse lines wrapped with \ */
+    StringStream(const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ",
+                 const std::string& endl = "", bool multiLine = false);
+  public:
+    /*! location of the next character of the source stream */
+    ParseLocation location() { return cin->loc(); }
+    /*! reads the next token from the source stream */
+    std::string next();
+  private:
+    /* the c<256 test rejects EOF and any other value outside the table */
+    __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; }
+    __forceinline bool isValidChar(unsigned int c) const { return c<256 && isValidCharMap[c]; }
+  private:
+    Ref<Stream<int> > cin; /*! source character stream */
+    bool isSepMap[256];    /*! map for fast classification of separators */
+    bool isValidCharMap[256];  /*! map for valid characters */
+    std::string endl;      /*! the token of the end of line */
+    bool multiLine;        /*! whether to parse lines wrapped with \ */
+  };
+}
--- a/thirdparty/embree/common/lexers/tokenstream.cpp
+++ b/thirdparty/embree/common/lexers/tokenstream.cpp
@@ -0,0 +1,181 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tokenstream.h"
+#include "../math/emath.h"
+
+namespace embree
+{
+  /* shorthands for common sets of characters */
+  const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz";   /* lower case letters */
+  const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";   /* upper case letters */
+  const std::string TokenStream::numbers = "0123456789";                 /* decimal digits */
+  const std::string TokenStream::separators = "\n\t\r ";                 /* newline, tab, carriage return, space */
+  /* characters accepted between the quotes of a string token (see tryString) */
+  const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\";
+
+  /* Builds a lookup table for fast categorization of characters: afterwards
+     map[c] is true exactly for the byte values c that occur in chrs. */
+  static void createCharMap(bool map[256], const std::string& chrs) {
+    for (bool* entry = map; entry != map+256; ++entry)
+      *entry = false;
+    for (const char ch : chrs)
+      map[uint8_t(ch)] = true;
+  }
+
+  /* Builds the full tokenizer from a list of valid characters and keywords:
+     stores the source stream and the symbol list and precomputes the lookup
+     tables for identifier, separator and string characters. */
+  TokenStream::TokenStream(const Ref<Stream<int> >& cin,            //< stream to read from
+                                   const std::string& alpha,                //< valid characters for identifiers
+                                   const std::string& seps,                 //< characters that act as separators
+                                   const std::vector<std::string>& symbols) //< symbols
+    : cin(cin),
+      symbols(symbols)
+  {
+    createCharMap(isStringCharMap,stringChars);
+    createCharMap(isSepMap,seps);
+    createCharMap(isAlphaMap,alpha);
+  }
+
+  /* Reads an optional sign ('+' or '-') followed by decimal digits. If at
+     least one digit was found, the characters read are appended to str_o and
+     true is returned; otherwise everything read is pushed back, str_o stays
+     untouched and false is returned. */
+  bool TokenStream::decDigits(std::string& str_o)
+  {
+    std::string text;
+    const int first = cin->peek();
+    if (first == '+' || first == '-') text += (char)cin->get();
+
+    const size_t signLength = text.size();
+    while (isDigit(cin->peek())) text += (char)cin->get();
+
+    /* no digit behind the optional sign: undo the read */
+    if (text.size() == signLength) {
+      cin->unget(text.size());
+      return false;
+    }
+    str_o += text;
+    return true;
+  }
+
+  /* Reads decimal digits (no sign). Appends them to str_o and returns true
+     if at least one digit was found, returns false otherwise. */
+  bool TokenStream::decDigits1(std::string& str_o)
+  {
+    std::string digits;
+    while (isDigit(cin->peek()))
+      digits += (char)cin->get();
+
+    if (digits.empty()) {
+      cin->unget(digits.size());
+      return false;
+    }
+    str_o += digits;
+    return true;
+  }
+
+  /* Tries to read exactly the characters of symbol from the stream. Returns
+     true and consumes them on success; on the first mismatch the characters
+     consumed so far are pushed back and false is returned. */
+  bool TokenStream::trySymbol(const std::string& symbol)
+  {
+    for (size_t matched = 0; matched < symbol.size(); matched++) {
+      if (symbol[matched] != cin->peek()) {
+        cin->unget(matched);
+        return false;
+      }
+      cin->drop();
+    }
+    return true;
+  }
+
+  /* Tries the registered symbols in the order they were given; the first one
+     that matches the input is consumed and stored in token as TY_SYMBOL.
+     Returns false if no symbol matches. */
+  bool TokenStream::trySymbols(Token& token, const ParseLocation& loc)
+  {
+    for (const std::string& symbol : symbols) {
+      if (trySymbol(symbol)) {
+        token = Token(symbol,Token::TY_SYMBOL,loc);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /* Tries to read a floating point literal. Accepted are the words "nan",
+     "+inf" and "-inf" as well as decimal numbers that contain a '.' and/or
+     an exponent (forms 1.[2], 1.[2]E2, 1E2, .3, .3E2; the leading digits and
+     the exponent may carry a sign). A plain integer is not accepted, so
+     that tryInt can handle it. On success the token (including its location)
+     is stored and true is returned; otherwise all characters read are
+     pushed back and false is returned. */
+  bool TokenStream::tryFloat(Token& token, const ParseLocation& loc)
+  {
+    bool ok = false;
+    std::string str;
+    /* special values; pass the location like for every other token */
+    if (trySymbol("nan")) {
+      token = Token(float(nan),loc);
+      return true;
+    }
+    if (trySymbol("+inf")) {
+      token = Token(float(pos_inf),loc);
+      return true;
+    }
+    if (trySymbol("-inf")) {
+      token = Token(float(neg_inf),loc);
+      return true;
+    }
+
+    /* number starting with digits */
+    if (decDigits(str))
+    {
+      if (cin->peek() == '.') {
+        str += (char)cin->get();
+        decDigits(str); // fractional digits are optional
+        if (cin->peek() == 'e' || cin->peek() == 'E') {
+          str += (char)cin->get();
+          if (decDigits(str)) ok = true; // 1.[2]E2
+        }
+        else ok = true; // 1.[2]
+      }
+      else if (cin->peek() == 'e' || cin->peek() == 'E') {
+        str += (char)cin->get();
+        if (decDigits(str)) ok = true; // 1E2
+      }
+    }
+    /* number starting with the decimal point */
+    else
+    {
+      if (cin->peek() == '.') {
+        str += (char)cin->get();
+        if (decDigits(str)) {
+          if (cin->peek() == 'e' || cin->peek() == 'E') {
+            str += (char)cin->get();
+            if (decDigits(str)) ok = true; // .3E2
+          }
+          else ok = true; // .3
+        }
+      }
+    }
+    if (ok) {
+      token = Token((float)atof(str.c_str()),loc);
+    }
+    /* str holds exactly the characters that are still consumed: give them back */
+    else cin->unget(str.size());
+    return ok;
+  }
+
+  /* Tries to read an optionally signed decimal integer; on success the value
+     (converted with atoi) is stored in token and true is returned. */
+  bool TokenStream::tryInt(Token& token, const ParseLocation& loc)
+  {
+    std::string digits;
+    if (!decDigits(digits))
+      return false;
+
+    token = Token(atoi(digits.c_str()),loc);
+    return true;
+  }
+
+  /* Tries to read a string enclosed in double quotes. Returns false without
+     consuming anything if the input does not start with a quote. A character
+     that is not a valid string character raises a runtime error. On success
+     the text between the quotes is stored in token as TY_STRING. */
+  bool TokenStream::tryString(Token& token, const ParseLocation& loc)
+  {
+    if (cin->peek() != '\"')
+      return false;
+    cin->drop(); // opening quote
+
+    std::string contents;
+    for (;;) {
+      if (cin->peek() == '\"') break;
+      const int c = cin->get();
+      if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character "+std::string(1,c)+" at "+loc.str());
+      contents += (char)c;
+    }
+    cin->drop(); // closing quote
+
+    token = Token(contents,Token::TY_STRING,loc);
+    return true;
+  }
+
+  /* Tries to read an identifier: one identifier character followed by any
+     number of identifier characters or digits. On success the identifier is
+     stored in token as TY_IDENTIFIER. */
+  bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc)
+  {
+    if (!isAlpha(cin->peek()))
+      return false;
+
+    std::string name(1,(char)cin->get());
+    while (isAlphaNum(cin->peek()))
+      name.push_back((char)cin->get());
+
+    token = Token(name,Token::TY_IDENTIFIER,loc);
+    return true;
+  }
+
+  /* consumes separator characters until a non-separator or EOF is next */
+  void TokenStream::skipSeparators()
+  {
+    for (;;) {
+      const int c = cin->peek();
+      if (c == EOF || !isSeparator(c)) return;
+      cin->drop();
+    }
+  }
+
+  /* Reads the next token. After skipping separators the token kinds are
+     tried in a fixed order (symbol, float, integer, string, identifier) and
+     the first that matches is returned. If none matches, an EOF token is
+     returned at the end of the input, otherwise the next character is
+     consumed and returned as a character token. */
+  Token TokenStream::next()
+  {
+    skipSeparators();
+    const ParseLocation loc = cin->loc();
+
+    Token token;
+    const bool parsed =
+      trySymbols   (token,loc) ||   /**< try to parse a symbol */
+      tryFloat     (token,loc) ||   /**< try to parse float */
+      tryInt       (token,loc) ||   /**< try to parse integer */
+      tryString    (token,loc) ||   /**< try to parse string */
+      tryIdentifier(token,loc);     /**< try to parse identifier */
+    if (parsed) return token;
+
+    if (cin->peek() == EOF)
+      return Token(loc);               /**< return EOF token */
+    return Token((char)cin->get(),loc); /**< return invalid character token */
+  }
+}
--- a/thirdparty/embree/common/lexers/tokenstream.h
+++ b/thirdparty/embree/common/lexers/tokenstream.h
@@ -0,0 +1,164 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+#include <string>
+#include <vector>
+
+namespace embree
+{
+  /*! Token class: a tagged value produced by the tokenizer. Depending on its
+   *  type a token carries a character, an integer, a float or a string
+   *  (identifier, string literal or symbol), plus the location it came from.
+   *  The location does not take part in comparisons. */
+  class Token
+  {
+  public:
+
+    enum Type { TY_EOF, TY_CHAR, TY_INT, TY_FLOAT, TY_IDENTIFIER, TY_STRING, TY_SYMBOL };
+
+    /*! constructors: the token type follows from the kind of value passed */
+    Token (        const ParseLocation& loc = ParseLocation()) : ty(TY_EOF  ),       loc(loc) {}
+    Token (char c, const ParseLocation& loc = ParseLocation()) : ty(TY_CHAR ), c(c), loc(loc) {}
+    Token (int i,  const ParseLocation& loc = ParseLocation()) : ty(TY_INT  ), i(i), loc(loc) {}
+    Token (float f,const ParseLocation& loc = ParseLocation()) : ty(TY_FLOAT), f(f), loc(loc) {}
+    Token (std::string str, Type ty, const ParseLocation& loc = ParseLocation()) : ty(ty),   str(str), loc(loc) {}
+
+    /*! factories for tokens without location, handy for comparisons */
+    static Token Eof()                { return Token(); }
+    static Token Sym(std::string str) { return Token(str,TY_SYMBOL); }
+    static Token Str(std::string str) { return Token(str,TY_STRING); }
+    static Token Id (std::string str) { return Token(str,TY_IDENTIFIER); }
+
+    /*! typed accessors: return the payload or raise a runtime error if the
+     *  token has a different type */
+    char Char() const {
+      if (ty != TY_CHAR) THROW_RUNTIME_ERROR(loc.str()+": character expected");
+      return c;
+    }
+
+    int Int() const {
+      if (ty != TY_INT) THROW_RUNTIME_ERROR(loc.str()+": integer expected");
+      return i;
+    }
+
+    /*! with cast == true an integer token is accepted and converted to float */
+    float Float(bool cast = true)  const {
+      if (ty == TY_INT && cast) return (float)i;
+      if (ty != TY_FLOAT) THROW_RUNTIME_ERROR(loc.str()+": float expected");
+      return f;
+    }
+
+    std::string Identifier() const {
+      if (ty != TY_IDENTIFIER) THROW_RUNTIME_ERROR(loc.str()+": identifier expected");
+      return str;
+    }
+
+    std::string String() const {
+      if (ty != TY_STRING) THROW_RUNTIME_ERROR(loc.str()+": string expected");
+      return str;
+    }
+
+    std::string Symbol() const {
+      if (ty != TY_SYMBOL) THROW_RUNTIME_ERROR(loc.str()+": symbol expected");
+      return str;
+    }
+
+    const ParseLocation& Location() const { return loc; }
+
+    /*! tokens are equal if they have the same type and the same payload */
+    friend bool operator==(const Token& a, const Token& b)
+    {
+      if (a.ty != b.ty) return false;
+      switch (a.ty) {
+      case TY_CHAR      : return a.c == b.c;
+      case TY_INT       : return a.i == b.i;
+      case TY_FLOAT     : return a.f == b.f;
+      case TY_IDENTIFIER:
+      case TY_STRING    :
+      case TY_SYMBOL    : return a.str == b.str;
+      default           : return true;
+      }
+    }
+
+    friend bool operator!=(const Token& a, const Token& b) {
+      return !(a == b);
+    }
+
+    /*! orders tokens by type first and by payload second */
+    friend bool operator <( const Token& a, const Token& b ) {
+      if (a.ty != b.ty) return (int)a.ty < (int)b.ty;
+      switch (a.ty) {
+      case TY_CHAR      : return a.c < b.c;
+      case TY_INT       : return a.i < b.i;
+      case TY_FLOAT     : return a.f < b.f;
+      case TY_IDENTIFIER:
+      case TY_STRING    :
+      case TY_SYMBOL    : return a.str < b.str;
+      default           : return false;
+      }
+    }
+
+    /*! prints the token as its type name followed by the payload in parentheses */
+    friend std::ostream& operator<<(std::ostream& cout, const Token& t)
+    {
+      switch (t.ty) {
+      case TY_EOF       : return cout << "eof";
+      case TY_CHAR      : return cout << "Char(" << t.c << ")";
+      case TY_INT       : return cout << "Int(" << t.i << ")";
+      case TY_FLOAT     : return cout << "Float(" << t.f << ")";
+      case TY_IDENTIFIER: return cout << "Id(" << t.str << ")";
+      case TY_STRING    : return cout << "String(" << t.str << ")";
+      case TY_SYMBOL    : return cout << "Symbol(" << t.str << ")";
+      default           : return cout << "unknown";
+      }
+    }
+
+  private:
+    Type ty;            //< the type of the token
+    union {
+      char c;           //< data for char tokens
+      int i;            //< data for int tokens
+      float f;          //< data for float tokens
+    };
+    std::string str;    //< data for string and identifier tokens
+    ParseLocation loc;  //< the location the token is from
+  };
+
+  /*! Full tokenizer that takes a list of valid characters and keywords:
+   *  turns a character stream into a stream of Token objects (symbols,
+   *  floats, integers, strings, identifiers, single characters, EOF). */
+  class TokenStream : public Stream<Token>
+  {
+  public:
+
+    /*! shorthands for common sets of characters */
+    static const std::string alpha;
+    static const std::string ALPHA;
+    static const std::string numbers;
+    static const std::string separators;
+    static const std::string stringChars;
+
+  public:
+    TokenStream(const Ref<Stream<int> >& cin,
+                const std::string& alpha, //< valid characters for identifiers
+                const std::string& seps,  //< characters that act as separators
+                const std::vector<std::string>& symbols = std::vector<std::string>()); //< symbols
+  public:
+    /*! location of the next character of the source stream */
+    ParseLocation location() { return cin->loc(); }
+    /*! reads the next token from the source stream */
+    Token next();
+    /*! consumes the given symbol if the input continues with it, returns whether it did */
+    bool trySymbol(const std::string& symbol);
+
+  private:
+    /* helpers of next(); each try* function returns true and fills token on
+       success and leaves the input unconsumed otherwise */
+    void skipSeparators();
+    bool decDigits(std::string& str);
+    bool decDigits1(std::string& str);
+    bool trySymbols(Token& token, const ParseLocation& loc);
+    bool tryFloat(Token& token, const ParseLocation& loc);
+    bool tryInt(Token& token, const ParseLocation& loc);
+    bool tryString(Token& token, const ParseLocation& loc);
+    bool tryIdentifier(Token& token, const ParseLocation& loc);
+
+    Ref<Stream<int> > cin;            //< source character stream
+    bool isSepMap[256];               //< lookup table: is the character a separator
+    bool isAlphaMap[256];             //< lookup table: is the character legal in an identifier
+    bool isStringCharMap[256];        //< lookup table: is the character valid inside a string
+    std::vector<std::string> symbols; //< symbols to recognize, tried in this order
+
+    /* the c<256 tests below reject EOF and any other value outside the tables */
+
+    /*! checks if a character is a separator */
+    __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; }
+
+    /*! checks if a character is a number */
+    __forceinline bool isDigit(unsigned int c) const {  return c >= '0' && c <= '9'; }
+
+    /*! checks if a character is valid inside a string */
+    __forceinline bool isStringChar(unsigned int c) const { return c<256 && isStringCharMap[c]; }
+
+    /*! checks if a character is legal for an identifier */
+    __forceinline bool isAlpha(unsigned int c) const {  return c<256 && isAlphaMap[c];  }
+    __forceinline bool isAlphaNum(unsigned int c) const { return isAlpha(c) || isDigit(c); }
+  };
+}
--- a/thirdparty/embree/common/math/affinespace.h
+++ b/thirdparty/embree/common/math/affinespace.h
@@ -0,0 +1,361 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "linearspace2.h"
+#include "linearspace3.h"
+#include "quaternion.h"
+#include "bbox.h"
+#include "vec4.h"
+
+namespace embree
+{
+  #define VectorT typename L::Vector
+  #define ScalarT typename L::Vector::Scalar
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Affine Space
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> // L is the linear-space type; it must provide L::Vector and L::Vector::Scalar
+    struct AffineSpaceT
+    {
+      L l;           /*< linear part of the affine map x -> l*x + p */
+      VectorT p;     /*< translation part of the affine map x -> l*x + p */
+
+      ////////////////////////////////////////////////////////////////////////////////
+      // Constructors, Assignment, Cast, Copy Operations
+      ////////////////////////////////////////////////////////////////////////////////
+
+      __forceinline AffineSpaceT           ( )                           { } // l and p are not explicitly initialized here
+      __forceinline AffineSpaceT           ( const AffineSpaceT& other ) { l = other.l; p = other.p; }
+      __forceinline AffineSpaceT           ( const L           & other ) { l = other  ; p = VectorT(zero); } // purely linear map: zero translation
+      __forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; }
+
+      __forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {}
+      __forceinline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {}
+
+      template<typename L1> __forceinline AffineSpaceT( const AffineSpaceT<L1>& s ) : l(s.l), p(s.p) {} // conversion from a space over another linear type
+
+      ////////////////////////////////////////////////////////////////////////////////
+      // Constants
+      ////////////////////////////////////////////////////////////////////////////////
+
+      __forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {} // all-zero map
+      __forceinline AffineSpaceT( OneTy )  : l(one),  p(zero) {} // identity: l = one, no translation
+
+      /*! return matrix for scaling; L::scale(s) is converted through the constructor taking L, so p is zero */
+      static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); }
+
+      /*! return matrix for translation by p (linear part is `one`) */
+      static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); }
+
+      /*! return matrix for rotation, only in 2D; forwards to L::rotate(r) with zero translation */
+      static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); }
+
+      /*! return matrix for rotation around arbitrary point (2D) or axis (3D); the generic version forwards to L::rotate(u,r) */
+      static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); }
+
+      /*! return matrix for rotation around arbitrary axis and point, only in 3D: translate by -p, rotate about u, translate back by +p */
+      static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p);  }
+
+      /*! return matrix for looking at given point, only in 3D; the frame (U,V,Z) has its origin at eye */
+      static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) {
+        VectorT Z = normalize(point-eye);   // viewing direction
+        VectorT U = normalize(cross(up,Z)); // perpendicular to up and Z
+        VectorT V = normalize(cross(Z,U));  // completes the frame, perpendicular to Z and U
+        return AffineSpaceT(L(U,V,Z),eye);
+      }
+
+    };
+  
+  // template specialization of the identity for AffineSpaceT<LinearSpace3ff> (AffineSpace3ff): p.w is 1 because the quaternion helpers in this header read p.w as the quaternion real part
+  template<>
+    __forceinline AffineSpaceT<LinearSpace3ff>::AffineSpaceT( OneTy )  : l(one),  p(0.f, 0.f, 0.f, 1.f) {}
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline AffineSpaceT<L> operator -( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(-a.l,-a.p); } // negates both l and p
+  template<typename L> __forceinline AffineSpaceT<L> operator +( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(+a.l,+a.p); } // unary plus applied to both parts
+  template<typename L> __forceinline AffineSpaceT<L>        rcp( const AffineSpaceT<L>& a ) { L il = rcp(a.l); return AffineSpaceT<L>(il,-(il*a.p)); } // inverse of x -> l*x + p is x -> il*x - il*p, with il = rcp(l)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline const AffineSpaceT<L> operator +( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l+b.l,a.p+b.p); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator -( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l-b.l,a.p-b.p); }
+  // scalar*space scales both l and p; space*space composes so that (a*b)(x) = a(b(x)); a/b multiplies by rcp(b)
+  template<typename L> __forceinline const AffineSpaceT<L> operator *( const ScalarT        & a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a*b.l,a*b.p); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator *( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l*b.l,a.l*b.p+a.p); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a * rcp(b); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const ScalarT        & b ) { return rcp(b) * a; } // scalar on the left: only operator*(ScalarT, AffineSpaceT) exists
+  // compound assignments forward to the binary operators above
+  template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a * b; }
+  template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const ScalarT        & b ) { return a = b * a; } // scalar on the left for the same reason as operator/
+  template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a / b; }
+  template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const ScalarT        & b ) { return a = a / b; }
+
+  template<typename L> __forceinline VectorT xfmPoint (const AffineSpaceT<L>& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); } // p.x*vx + p.y*vy + p.z*vz + m.p, assuming madd(a,b,c) = a*b+c; reads p.z, so 3D only
+  template<typename L> __forceinline VectorT xfmVector(const AffineSpaceT<L>& m, const VectorT& v) { return xfmVector(m.l,v); } // directions use the linear part only; m.p is ignored
+  template<typename L> __forceinline VectorT xfmNormal(const AffineSpaceT<L>& m, const VectorT& n) { return xfmNormal(m.l,n); } // normals use the linear part only; m.p is ignored
+
+  __forceinline const BBox<Vec3fa> xfmBounds(const AffineSpaceT<LinearSpace3<Vec3fa> >& m, const BBox<Vec3fa>& b) // bounds of box b after transforming it by m
+  { 
+    BBox3fa dst = empty; // start from the empty box and grow it by each transformed corner
+    const Vec3fa p0(b.lower.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p0));
+    const Vec3fa p1(b.lower.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p1));
+    const Vec3fa p2(b.lower.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p2));
+    const Vec3fa p3(b.lower.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p3));
+    const Vec3fa p4(b.upper.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p4));
+    const Vec3fa p5(b.upper.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p5));
+    const Vec3fa p6(b.upper.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p6));
+    const Vec3fa p7(b.upper.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p7));
+    return dst; // axis-aligned box enclosing all eight transformed corners
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline bool operator ==( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l == b.l && a.p == b.p; } // equal only if both the linear part and p compare equal
+  template<typename L> __forceinline bool operator !=( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l != b.l || a.p != b.p; } // differs if either part differs
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline AffineSpaceT<L> select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT<L>& t, const AffineSpaceT<L>& f ) { // s is the mask type of the scalar
+    return AffineSpaceT<L>(select(s,t.l,f.l),select(s,t.p,f.p)); // select is applied to the linear part and to p separately, with the same mask
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT<L>& m) { // prints the space as "{ l = <l>, p = <p> }"
+    return cout << "{ l = " << m.l << ", p = " << m.p << " }";
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Template Instantiations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  typedef AffineSpaceT<LinearSpace2f> AffineSpace2f;
+  typedef AffineSpaceT<LinearSpace3f> AffineSpace3f;
+  typedef AffineSpaceT<LinearSpace3fa> AffineSpace3fa;
+  typedef AffineSpaceT<LinearSpace3fx> AffineSpace3fx;
+  typedef AffineSpaceT<LinearSpace3ff> AffineSpace3ff;
+  typedef AffineSpaceT<Quaternion3f > OrthonormalSpace3f;
+
+  template<int N> using AffineSpace3vf = AffineSpaceT<LinearSpace3<Vec3<vfloat<N>>>>;
+  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<4>>>>  AffineSpace3vf4;
+  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<8>>>>  AffineSpace3vf8;
+  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<16>>>> AffineSpace3vf16;
+
+  template<int N> using AffineSpace3vff = AffineSpaceT<LinearSpace3<Vec4<vfloat<N>>>>;
+  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<4>>>>  AffineSpace3vfa4;
+  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<8>>>>  AffineSpace3vfa8;
+  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<16>>>> AffineSpace3vfa16;
+
+  //////////////////////////////////////////////////////////////////////////////
+  /// Interpolation
+  //////////////////////////////////////////////////////////////////////////////
+  template<typename T, typename R> // T: linear-space type, R: type of the interpolation parameter
+  __forceinline AffineSpaceT<T> lerp(const AffineSpaceT<T>& M0,
+                                     const AffineSpaceT<T>& M1,
+                                     const R& t)
+  {
+    return AffineSpaceT<T>(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t)); // linear part and p are interpolated independently
+  }
+
+  // slerp interprets the 16 floats of the matrix M = D * R * S as components of
+  // three matrices (D, R, S) that are interpolated individually.
+  template<typename T> __forceinline AffineSpaceT<LinearSpace3<Vec3<T>>>
+  slerp(const AffineSpaceT<LinearSpace3<Vec4<T>>>& M0,
+        const AffineSpaceT<LinearSpace3<Vec4<T>>>& M1,
+        const T& t)
+  {
+    QuaternionT<T> q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); // the w lanes carry the quaternion (r, i, j, k)
+    QuaternionT<T> q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
+    QuaternionT<T> q = slerp(q0, q1, t);                       // rotation is interpolated on the quaternions
+
+    AffineSpaceT<LinearSpace3<Vec3<T>>> S = lerp(M0, M1, t);   // remaining entries are interpolated linearly; conversion drops the w lanes
+    AffineSpaceT<LinearSpace3<Vec3<T>>> D(one);
+    D.p.x = S.l.vx.y; // the translation of D is stored in three off-diagonal slots of the packed matrix
+    D.p.y = S.l.vx.z;
+    D.p.z = S.l.vy.z;
+    S.l.vx.y = 0;     // clear those slots so that S no longer contains the translation values
+    S.l.vx.z = 0;
+    S.l.vy.z = 0;
+
+    AffineSpaceT<LinearSpace3<Vec3<T>>> R = LinearSpace3<Vec3<T>>(q); // rotation matrix built from the interpolated quaternion
+    return D * R * S;
+  }
+
+  // this is a specialized version for Vec3fa because that does
+  // not play along nicely with the other templated Vec3/Vec4 types
+  __forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0,
+                                     const AffineSpace3ff& M1,
+                                     const float& t)
+  {
+    Quaternion3f q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); // the w lanes carry the quaternion (r, i, j, k)
+    Quaternion3f q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
+    Quaternion3f q = slerp(q0, q1, t);                       // rotation is interpolated on the quaternions
+
+    AffineSpace3fa S = lerp(M0, M1, t); // remaining entries are interpolated linearly, then converted to AffineSpace3fa
+    AffineSpace3fa D(one);
+    D.p.x = S.l.vx.y; // the translation of D is stored in three off-diagonal slots of the packed matrix
+    D.p.y = S.l.vx.z;
+    D.p.z = S.l.vy.z;
+    S.l.vx.y = 0;     // clear those slots so that S no longer contains the translation values
+    S.l.vx.z = 0;
+    S.l.vy.z = 0;
+
+    AffineSpace3fa R = LinearSpace3fa(q); // rotation matrix built from the interpolated quaternion
+    return D * R * S;
+  }
+  
+  __forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd) // expands a packed decomposition into the transform D * R * M
+  {
+    // compute affine transform from quaternion decomposition
+    Quaternion3f q(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); // quaternion (r, i, j, k) lives in the w lanes
+    AffineSpace3fa M = qd;
+    AffineSpace3fa D(one);
+    D.p.x = M.l.vx.y; // translation of D is read from three off-diagonal slots of the packed matrix
+    D.p.y = M.l.vx.z;
+    D.p.z = M.l.vy.z;
+    M.l.vx.y = 0;     // clear those slots before M is used as a matrix
+    M.l.vx.z = 0;
+    M.l.vy.z = 0;
+    AffineSpace3fa R = LinearSpace3fa(q); // rotation matrix built from the quaternion
+    return D * R * M;
+  }
+  
+  __forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S) // unpacks qd into translation T, quaternion q and remainder S
+  {
+    q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); // quaternion (r, i, j, k) from the w lanes
+    S = qd;
+    T.x = qd.l.vx.y; // only T.x, T.y and T.z are written
+    T.y = qd.l.vx.z;
+    T.z = qd.l.vy.z;
+    S.l.vx.y = 0;    // remove the translation slots from S
+    S.l.vx.z = 0;
+    S.l.vy.z = 0;
+  }
+
+  __forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S) // packs T, q and S into one matrix (inverse of the unpacking overload)
+  {
+    AffineSpace3ff M = S;
+    M.l.vx.w = q.i; // quaternion imaginary parts go into the w lanes of vx, vy, vz
+    M.l.vy.w = q.j;
+    M.l.vz.w = q.k;
+    M.p.w    = q.r; // quaternion real part goes into p.w
+    M.l.vx.y = T.x; // translation overwrites three off-diagonal slots of S
+    M.l.vx.z = T.y;
+    M.l.vy.z = T.z;
+    return M;       // built as AffineSpace3ff and converted to the AffineSpace3fx return type
+  }
+
+  struct __aligned(16) QuaternionDecomposition // 16 named floats; the defaults describe the identity transform
+  {
+    float scale_x = 1.f; // unit scale by default
+    float scale_y = 1.f;
+    float scale_z = 1.f;
+    float skew_xy = 0.f; // no skew by default
+    float skew_xz = 0.f;
+    float skew_yz = 0.f;
+    float shift_x = 0.f; // no shift by default
+    float shift_y = 0.f;
+    float shift_z = 0.f;
+    float quaternion_r = 1.f; // identity quaternion (r = 1, i = j = k = 0) by default
+    float quaternion_i = 0.f;
+    float quaternion_j = 0.f;
+    float quaternion_k = 0.f;
+    float translation_x = 0.f; // no translation by default
+    float translation_y = 0.f;
+    float translation_z = 0.f;
+  };
+
+  __forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M) // copies the 16 packed floats of M into named fields
+  {
+    QuaternionDecomposition qd;
+    qd.scale_x       = M.l.vx.x; // scale: diagonal of the linear part
+    qd.scale_y       = M.l.vy.y;
+    qd.scale_z       = M.l.vz.z;
+    qd.shift_x       = M.p.x;    // shift: xyz of p
+    qd.shift_y       = M.p.y;
+    qd.shift_z       = M.p.z;
+    qd.translation_x = M.l.vx.y; // translation: vx.y, vx.z, vy.z
+    qd.translation_y = M.l.vx.z;
+    qd.translation_z = M.l.vy.z;
+    qd.skew_xy       = M.l.vy.x; // skew: vy.x, vz.x, vz.y
+    qd.skew_xz       = M.l.vz.x;
+    qd.skew_yz       = M.l.vz.y;
+    qd.quaternion_r  = M.p.w;    // quaternion: w lanes of p, vx, vy, vz
+    qd.quaternion_i  = M.l.vx.w;
+    qd.quaternion_j  = M.l.vy.w;
+    qd.quaternion_k  = M.l.vz.w;
+    return qd;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /*
+   * ! Template Specialization for 2D: return matrix for rotation by r around point p
+   * (rotation around arbitrary vector is not meaningful in 2D)
+   */
+  template<> __forceinline
+  AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r) {
+    return translate(+p)*AffineSpace2f(LinearSpace2f::rotate(r))*translate(-p); // translate by -p, rotate, translate back by +p
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Similarity Transform
+  //
+  // checks, if M is a similarity transformation, i.e if there exists a factor D
+  // such that for all x,y: distance(Mx, My) = D * distance(x, y)
+  ////////////////////////////////////////////////////////////////////////////////
+  __forceinline bool similarityTransform(const AffineSpace3fa& M, float* D) // D is optional (may be null); only the linear part M.l is inspected
+  {
+    if (D) *D = 0.f; // value reported when M is not a similarity transform
+    if (abs(dot(M.l.vx, M.l.vy)) > 1e-5f) return false; // the three axes must be pairwise orthogonal (absolute tolerance)
+    if (abs(dot(M.l.vx, M.l.vz)) > 1e-5f) return false;
+    if (abs(dot(M.l.vy, M.l.vz)) > 1e-5f) return false;
+
+    const float D_x = dot(M.l.vx, M.l.vx); // squared lengths of the axes
+    const float D_y = dot(M.l.vy, M.l.vy);
+    const float D_z = dot(M.l.vz, M.l.vz);
+
+    if (abs(D_x - D_y) > 1e-5f || // all squared lengths must agree (absolute tolerance)
+        abs(D_x - D_z) > 1e-5f ||
+        abs(D_y - D_z) > 1e-5f)
+      return false;
+
+    if (D) *D = sqrtf(D_x); // common scale factor = length of an axis
+    return true;
+  }
+  
+  __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr) // writes source to *ptr using Vec3fa::storeu for each of the four vectors
+  {
+    Vec3fa::storeu(&ptr->l.vx, source.l.vx);
+    Vec3fa::storeu(&ptr->l.vy, source.l.vy);
+    Vec3fa::storeu(&ptr->l.vz, source.l.vz);
+    Vec3fa::storeu(&ptr->p, source.p);
+  }
+
+  __forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr) // reads *ptr using Vec3fa::loadu for each of the four vectors
+  {
+    AffineSpace3fa space;
+    space.l.vx = Vec3fa::loadu(&ptr->l.vx);
+    space.l.vy = Vec3fa::loadu(&ptr->l.vy);
+    space.l.vz = Vec3fa::loadu(&ptr->l.vz);
+    space.p    = Vec3fa::loadu(&ptr->p);
+    return space;
+  }
+
+  #undef VectorT
+  #undef ScalarT
+}
--- a/thirdparty/embree/common/math/bbox.h
+++ b/thirdparty/embree/common/math/bbox.h
@@ -0,0 +1,336 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+#include "vec3.h"
+
+namespace embree
+{
+  namespace internal {
+    // halves a value: generic version divides by T(2), float/double specializations multiply by 0.5
+    template <typename T> __forceinline T divideByTwo(const T& v) { return v / T(2); }
+    template <> __forceinline float divideByTwo<float>(const float& v) { return v * 0.5f; }
+    template <> __forceinline double divideByTwo<double>(const double& v) { return v * 0.5; }
+
+  } // namespace internal
+  template<typename T> // T is the point type (a scalar or a vector type)
+  struct BBox
+  {
+    T lower, upper; // componentwise minimum and maximum corner of the box
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline BBox           ( )                   { } // lower and upper are not explicitly initialized here
+    template<typename T1>
+    __forceinline BBox           ( const BBox<T1>& other ) : lower(other.lower), upper(other.upper) {} // conversion from a box over another point type
+    __forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; }
+
+    __forceinline BBox ( const T& v                     ) : lower(v), upper(v) {} // degenerate box containing the single point v
+    __forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Extending Bounds
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } // grows in place to include other
+    __forceinline const BBox& extend(const T   & other) { lower = min(lower,other      ); upper = max(upper,other      ); return *this; } // grows in place to include a point
+
+    /*! tests if box is empty, i.e. lower exceeds upper in at least one of the T::N components */
+    __forceinline bool empty() const { for (int i=0; i<T::N; i++) if (lower[i] > upper[i]) return true; return false; }
+
+    /*! computes the size of the box (upper - lower) */
+    __forceinline T size() const { return upper - lower; }
+
+    /*! computes the center of the box, (lower + upper) / 2 */
+    __forceinline T center() const { return internal::divideByTwo<T>(lower+upper); }
+
+    /*! computes twice the center of the box (lower + upper, without the halving) */
+    __forceinline T center2() const { return lower+upper; }
+
+    /*! merges two boxes: smallest box containing both a and b */
+    __forceinline static const BBox merge (const BBox& a, const BBox& b) {
+      return BBox(min(a.lower, b.lower), max(a.upper, b.upper));
+    }
+
+    /*! intersects two boxes; the result has lower > upper in some component when a and b do not overlap */
+    __forceinline static const BBox intersect (const BBox& a, const BBox& b) {
+      return BBox(max(a.lower, b.lower), min(a.upper, b.upper));
+    }
+
+     /*! enlarge box by some scaling factor: each bound moves outwards by a times its own absolute value */
+    __forceinline BBox enlarge_by(const float a) const {
+      return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper));
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} // inverted bounds: the empty box
+    __forceinline BBox( FullTy  ) : lower(neg_inf), upper(pos_inf) {} // unbounded box
+    __forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {} // same bounds as EmptyTy
+    __forceinline BBox( TrueTy  ) : lower(neg_inf), upper(pos_inf) {} // same bounds as FullTy
+    __forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {} // same bounds as EmptyTy
+    __forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {} // same bounds as FullTy
+  };
+
+  template<> __forceinline bool BBox<float>::empty() const { // scalar specialization: float has no T::N or operator[]
+    return lower > upper;
+  }
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+  template<> __forceinline bool BBox<Vec3fa>::empty() const { // mask-based test instead of the per-component loop
+    return !all(le_mask(lower,upper));
+  }
+  template<> __forceinline bool BBox<Vec3fx>::empty() const { // same mask-based test for Vec3fx
+    return !all(le_mask(lower,upper));
+  }
+#endif
+
+  /*! tests if box is finite: lower > -FLT_LARGE and upper < +FLT_LARGE in every tested component */
+  __forceinline bool isvalid( const BBox<Vec3fa>& v ) {
+    return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)));
+  }
+
+  /*! tests if box is finite and non-empty (additionally requires lower <= upper) */
+  __forceinline bool isvalid_non_empty( const BBox<Vec3fa>& v ) {
+    return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper));
+  }
+  
+  /*! tests if box has finite entries (is_finite on both corners) */
+  __forceinline bool is_finite( const BBox<Vec3fa>& b) {
+    return is_finite(b.lower) && is_finite(b.upper);
+  }
+
+  /*! test if point contained in box; points on the boundary count as inside */
+  __forceinline bool inside ( const BBox<Vec3fa>& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); }
+
+  /*! computes the center of the box; center2 returns lower+upper, i.e. twice the center */
+  template<typename T> __forceinline const T center2(const BBox<T>& box) { return box.lower + box.upper; }
+  template<typename T> __forceinline const T center (const BBox<T>& box) { return internal::divideByTwo<T>(center2(box)); }
+
+  /*! computes the volume of a bounding box; safeVolume returns 0 for an empty box instead */
+  __forceinline float volume    ( const BBox<Vec3fa>& b ) { return reduce_mul(b.size()); }
+  __forceinline float safeVolume( const BBox<Vec3fa>& b ) { if (b.empty()) return 0.0f; else return volume(b); }
+
+  /*! computes the volume of a bounding box (Vec3f overload) */
+  __forceinline float volume( const BBox<Vec3f>& b )  { return reduce_mul(b.size()); }
+
+  /*! computes the area of a 2D bounding box (width * height) */
+  template<typename T> __forceinline const T area( const BBox<Vec2<T> >& b ) { const Vec2<T> d = b.size(); return d.x*d.y; }
+  // 3D boxes: halfArea is computed from the box size, area is twice halfArea
+  template<typename T> __forceinline const T halfArea( const BBox<Vec3<T> >& b ) { return halfArea(b.size()); }
+  template<typename T> __forceinline const T     area( const BBox<Vec3<T> >& b ) { return T(2)*halfArea(b); }
+
+  __forceinline float halfArea( const BBox<Vec3fa>& b ) { return halfArea(b.size()); }
+  __forceinline float     area( const BBox<Vec3fa>& b ) { return 2.0f*halfArea(b); }
+
+  __forceinline float halfArea( const BBox<Vec3fx>& b ) { return halfArea(b.size()); }
+  __forceinline float     area( const BBox<Vec3fx>& b ) { return 2.0f*halfArea(b); }
+  // safeArea returns 0 for an empty box instead of evaluating area
+  template<typename Vec> __forceinline float safeArea( const BBox<Vec>& b ) { if (b.empty()) return 0.0f; else return area(b); }
+
+  template<typename T> __forceinline float expectedApproxHalfArea(const BBox<T>& box) { // for a plain box this is just halfArea
+    return halfArea(box);
+  }
+
+  /*! merges bounding boxes and points: result is the componentwise min of lowers and max of uppers */
+  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const       T& b ) { return BBox<T>(min(a.lower, b    ), max(a.upper, b    )); }
+  template<typename T> __forceinline const BBox<T> merge( const       T& a, const BBox<T>& b ) { return BBox<T>(min(a    , b.lower), max(a    , b.upper)); }
+  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(min(a.lower, b.lower), max(a.upper, b.upper)); }
+
+  /*! Merges three boxes (a with the merge of b and c). */
+  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return merge(a,merge(b,c)); }
+
+  /*! Merges four boxes (pairwise, then the two results). */
+  template<typename T> __forceinline BBox<T> merge(const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d) {
+    return merge(merge(a,b),merge(c,d));
+  }
+
+  /*! Comparison Operators: boxes are equal when both corners are equal */
+  template<typename T> __forceinline bool operator==( const BBox<T>& a, const BBox<T>& b ) { return a.lower == b.lower && a.upper == b.upper; }
+  template<typename T> __forceinline bool operator!=( const BBox<T>& a, const BBox<T>& b ) { return a.lower != b.lower || a.upper != b.upper; }
+
+  /*! scaling: both corners are multiplied by a; the corners are not re-sorted afterwards */
+  template<typename T> __forceinline BBox<T> operator *( const float& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
+  template<typename T> __forceinline BBox<T> operator *( const     T& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
+
+  /*! translations: corner-wise sum/difference of two boxes, or shift of both corners by a point b */
+  template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower+b.lower,a.upper+b.upper); }
+  template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower-b.lower,a.upper-b.upper); }
+  template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const      T & b ) { return BBox<T>(a.lower+b      ,a.upper+b      ); }
+  template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const      T & b ) { return BBox<T>(a.lower-b      ,a.upper-b      ); }
+
+  /*! extension: moves lower down by b and upper up by b */
+  template<typename T> __forceinline BBox<T> enlarge(const BBox<T>& a, const T& b) { return BBox<T>(a.lower-b, a.upper+b); }
+
+  /*! intersect bounding boxes: max of lowers and min of uppers; three and four boxes are reduced via the two-box form */
+  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(max(a.lower, b.lower), min(a.upper, b.upper)); }
+  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return intersect(a,intersect(b,c)); }
+  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d ) { return intersect(intersect(a,b),intersect(c,d)); }
+
+  /*! subtract bounds from each other: c receives the part of a below b.lower, d the part of a above b.upper */
+  template<typename T> __forceinline void subtract(const BBox<T>& a, const BBox<T>& b, BBox<T>& c, BBox<T>& d)
+  {
+    c.lower = a.lower;
+    c.upper = min(a.upper,b.lower); // clip a's upper bound at b's lower bound
+    d.lower = max(a.lower,b.upper); // clip a's lower bound at b's upper bound
+    d.upper = a.upper;
+  }
+
+  /*! tests if bounding boxes (and points) are disjoint (empty intersection); a point is treated as the degenerate box BBox<T>(point) */
+  template<typename T> __inline bool disjoint( const BBox<T>& a, const BBox<T>& b ) { return intersect(a,b).empty(); }
+  template<typename T> __inline bool disjoint( const BBox<T>& a, const       T& b ) { return disjoint(a,BBox<T>(b)); }
+  template<typename T> __inline bool disjoint( const       T& a, const BBox<T>& b ) { return disjoint(BBox<T>(a),b); }
+
+  /*! tests if bounding boxes (and points) are conjoint (non-empty intersection); exact negation of disjoint */
+  template<typename T> __inline bool conjoint( const BBox<T>& a, const BBox<T>& b ) { return !intersect(a,b).empty(); }
+  template<typename T> __inline bool conjoint( const BBox<T>& a, const       T& b ) { return conjoint(a,BBox<T>(b)); }
+  template<typename T> __inline bool conjoint( const       T& a, const BBox<T>& b ) { return conjoint(BBox<T>(a),b); }
+
+  /*! subset relation: true when a lies inside b, i.e. a.lower >= b.lower and a.upper <= b.upper in every component */
+  template<typename T> __inline bool subset( const BBox<T>& a, const BBox<T>& b )
+  { 
+    for ( size_t i = 0; i < T::N; i++ ) if ( a.lower[i] < b.lower[i] ) return false;
+    for ( size_t i = 0; i < T::N; i++ ) if ( a.upper[i] > b.upper[i] ) return false;
+    return true; 
+  }
+
+  template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) { // mask-based version of the same test
+    return all(ge_mask(a.lower,b.lower)) && all(le_mask(a.upper,b.upper));
+  }
+
+  template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) { // mask-based version of the same test
+    return all(ge_mask(a.lower,b.lower)) && all(le_mask(a.upper,b.upper));
+  }
+  
+  /*! blending: interpolates the lower corners and the upper corners separately with parameter t */
+  template<typename T>
+    __forceinline BBox<T> lerp(const BBox<T>& b0, const BBox<T>& b1, const float t) {
+    return BBox<T>(lerp(b0.lower,b1.lower,t),lerp(b0.upper,b1.upper,t));
+  }
+
+  /*! output operator: prints the box as "[lower; upper]" */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const BBox<T>& box) {
+    return cout << "[" << box.lower << "; " << box.upper << "]";
+  }
+
+  /*! default template instantiations */
+  typedef BBox<float> BBox1f;
+  typedef BBox<Vec2f> BBox2f;
+  typedef BBox<Vec2fa> BBox2fa;
+  typedef BBox<Vec3f> BBox3f;
+  typedef BBox<Vec3fa> BBox3fa;
+  typedef BBox<Vec3fx> BBox3fx;
+  typedef BBox<Vec3ff> BBox3ff;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / MIC specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__SSE__) || defined(__ARM_NEON)
+#include "../simd/sse.h"
+#endif
+
+#if defined (__AVX__)
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  template<int N> // primary template is only declared; the N = 4 and N = 8 specializations provide the bodies
+    __forceinline BBox<Vec3<vfloat<N>>> transpose(const BBox3fa* bounds);
+  
+  template<>
+    __forceinline BBox<Vec3<vfloat4>> transpose<4>(const BBox3fa* bounds) // reads bounds[0..3]
+  {
+    BBox<Vec3<vfloat4>> dest;
+    
+    transpose((vfloat4&)bounds[0].lower, // each Vec3fa is reinterpreted as a vfloat4 (C-style cast, also drops const)
+              (vfloat4&)bounds[1].lower,
+              (vfloat4&)bounds[2].lower,
+              (vfloat4&)bounds[3].lower,
+              dest.lower.x, // presumably receives the x components of the four lower corners -- confirm against the SIMD transpose helper
+              dest.lower.y,
+              dest.lower.z);
+    
+    transpose((vfloat4&)bounds[0].upper, // same for the upper corners
+              (vfloat4&)bounds[1].upper,
+              (vfloat4&)bounds[2].upper,
+              (vfloat4&)bounds[3].upper,
+              dest.upper.x,
+              dest.upper.y,
+              dest.upper.z);
+    
+    return dest;
+  }
+  
+#if defined(__AVX__)
+  template<>
+    __forceinline BBox<Vec3<vfloat8>> transpose<8>(const BBox3fa* bounds) // reads bounds[0..7]; only compiled with AVX
+  {
+    BBox<Vec3<vfloat8>> dest;
+    
+    transpose((vfloat4&)bounds[0].lower, // each Vec3fa is reinterpreted as a vfloat4 (C-style cast, also drops const)
+              (vfloat4&)bounds[1].lower,
+              (vfloat4&)bounds[2].lower,
+              (vfloat4&)bounds[3].lower,
+              (vfloat4&)bounds[4].lower,
+              (vfloat4&)bounds[5].lower,
+              (vfloat4&)bounds[6].lower,
+              (vfloat4&)bounds[7].lower,
+              dest.lower.x, // presumably receives the x components of the eight lower corners -- confirm against the SIMD transpose helper
+              dest.lower.y,
+              dest.lower.z);
+    
+    transpose((vfloat4&)bounds[0].upper, // same for the upper corners
+              (vfloat4&)bounds[1].upper,
+              (vfloat4&)bounds[2].upper,
+              (vfloat4&)bounds[3].upper,
+              (vfloat4&)bounds[4].upper,
+              (vfloat4&)bounds[5].upper,
+              (vfloat4&)bounds[6].upper,
+              (vfloat4&)bounds[7].upper,
+              dest.upper.x,
+              dest.upper.y,
+              dest.upper.z);
+    
+    return dest;
+  }
+#endif
+  
+  template<int N> // primary template is only declared; the N = 4 and N = 8 specializations provide the bodies
+    __forceinline BBox3fa merge(const BBox3fa* bounds);
+  
+  template<>
+    __forceinline BBox3fa merge<4>(const BBox3fa* bounds) // union of bounds[0..3]
+  {
+    const Vec3fa lower = min(min(bounds[0].lower,bounds[1].lower),
+                             min(bounds[2].lower,bounds[3].lower));
+    const Vec3fa upper = max(max(bounds[0].upper,bounds[1].upper),
+                             max(bounds[2].upper,bounds[3].upper));
+    return BBox3fa(lower,upper);
+  }
+  
+#if defined(__AVX__)
+  template<>
+    __forceinline BBox3fa merge<8>(const BBox3fa* bounds) // union of bounds[0..7]; only compiled with AVX
+  {
+    const Vec3fa lower = min(min(min(bounds[0].lower,bounds[1].lower),min(bounds[2].lower,bounds[3].lower)),
+                             min(min(bounds[4].lower,bounds[5].lower),min(bounds[6].lower,bounds[7].lower)));
+    const Vec3fa upper = max(max(max(bounds[0].upper,bounds[1].upper),max(bounds[2].upper,bounds[3].upper)),
+                             max(max(bounds[4].upper,bounds[5].upper),max(bounds[6].upper,bounds[7].upper)));
+    return BBox3fa(lower,upper);
+  }
+#endif
+}
+
--- a/thirdparty/embree/common/math/col3.h
+++ b/thirdparty/embree/common/math/col3.h
@@ -0,0 +1,47 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "emath.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// RGB Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Col3 // T is the type of one color channel
+  {
+    T r, g, b; // red, green and blue channels
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col3           ( )                   { } // channels are not explicitly initialized here
+    __forceinline Col3           ( const Col3& other ) { r = other.r; g = other.g; b = other.b; }
+    __forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; }
+
+    __forceinline explicit Col3 (const T& v)                         : r(v), g(v), b(v) {} // same value in every channel; explicit to avoid implicit scalar-to-color conversion
+    __forceinline          Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants: every channel is set to the same named constant
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col3 (ZeroTy)   : r(zero)   , g(zero)   , b(zero)    {}
+    __forceinline Col3 (OneTy)    : r(one)    , g(one)    , b(one)     {}
+    __forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {}
+    __forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {}
+  };
+
+  /*! output operator: prints the color as "(r, g, b)" */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col3<T>& a) {
+    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
+  }
+
+  /*! default template instantiations */
+  typedef Col3<unsigned char> Col3uc;
+  typedef Col3<float        > Col3f;
+}
--- a/thirdparty/embree/common/math/col4.h
+++ b/thirdparty/embree/common/math/col4.h
@@ -0,0 +1,47 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "emath.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// RGBA Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Col4
+  {
+    T r, g, b, a;  // red, green, blue and alpha channel
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col4() {}  // no channel is assigned here
+    __forceinline Col4(const Col4& rhs) { r = rhs.r; g = rhs.g; b = rhs.b; a = rhs.a; }
+    __forceinline Col4& operator=(const Col4& rhs) { r = rhs.r; g = rhs.g; b = rhs.b; a = rhs.a; return *this; }
+
+    __forceinline explicit Col4(const T& value) : r(value), g(value), b(value), a(value) {}  // same value in every channel
+    __forceinline Col4(const T& red, const T& green, const T& blue, const T& alpha) : r(red), g(green), b(blue), a(alpha) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (each tag value is converted to T per channel)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col4(ZeroTy)   : r(zero),    g(zero),    b(zero),    a(zero)    {}
+    __forceinline Col4(OneTy)    : r(one),     g(one),     b(one),     a(one)     {}
+    __forceinline Col4(PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {}
+    __forceinline Col4(NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {}
+  };
+
+  /*! output operator: writes the color as "(r, g, b, a)" using T's own stream operator for each channel */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col4<T>& a) {
+    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")";
+  }
+
+  /*! commonly used instantiations: 8-bit and single-precision RGBA */
+  using Col4uc = Col4<unsigned char>;
+  using Col4f  = Col4<float>;
+}
--- a/thirdparty/embree/common/math/color.h
+++ b/thirdparty/embree/common/math/color.h
@@ -0,0 +1,268 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#  include "color_sycl.h"
+#else
+
+#include "constants.h"
+#include "col3.h"
+#include "col4.h"
+
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE RGBA Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct Color4
+  {
+    union {
+      __m128 m128;                // all four channels as one SSE register
+      struct { float r,g,b,a; };  // per-channel view of the same storage
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color4 () {}  // leaves all channels uninitialized
+    __forceinline Color4 ( const __m128 a ) : m128(a) {}  // implicit: wraps a raw register
+
+    __forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {}  // v in every channel, alpha included
+    __forceinline          Color4 (const float r, const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {}  // _mm_set_ps takes the lanes in reverse order
+
+    __forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); }  // 8-bit RGB scaled by 1/255, alpha becomes 255/255
+    __forceinline explicit Color4 ( const Col3f&  other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); }  // alpha set to 1
+    __forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); }  // all four 8-bit channels scaled by 1/255
+    __forceinline explicit Color4 ( const Col4f&  other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); }
+
+    __forceinline Color4           ( const Color4& other ) : m128(other.m128) {}
+    __forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+    __forceinline operator const __m128&() const { return m128; }  // lets a Color4 be passed straight to intrinsics
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Set (writes this color into the destination passed by reference)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }  // alpha is dropped
+    __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; }
+    __forceinline void set(Col3uc& d) const
+    {
+      vfloat4 s = clamp(vfloat4(m128))*255.0f;  // presumably clamps to [0,1] first -- confirm against vfloat4 clamp
+      d.r = (unsigned char)(s[0]);              // the casts truncate, they do not round
+      d.g = (unsigned char)(s[1]);
+      d.b = (unsigned char)(s[2]);
+    }
+    __forceinline void set(Col4uc& d) const
+    {
+      vfloat4 s = clamp(vfloat4(m128))*255.0f;  // same conversion as above, alpha included
+      d.r = (unsigned char)(s[0]);
+      d.g = (unsigned char)(s[1]);
+      d.b = (unsigned char)(s[2]);
+      d.a = (unsigned char)(s[3]);
+    }
+    __forceinline void set(float &f) const
+    {
+      f = 0.2126f*r+0.7152f*g+0.0722f*b; // sRGB (Rec. 709) luminance; the weights sum to 1, alpha is ignored.
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (the value is broadcast to all four channels)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color4( ZeroTy   ) : m128(_mm_set1_ps(0.0f)) {}
+    __forceinline Color4( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE RGB Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct Color
+  {
+    union {
+      __m128 m128;              // register view; it has a fourth lane besides r,g,b
+      struct { float r,g,b; };  // named access to the first three lanes
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color () {}  // leaves the register uninitialized
+    __forceinline Color ( const __m128 a ) : m128(a) {}  // implicit: lets intrinsic results be returned as Color
+
+    __forceinline explicit Color  (const float v)                               : m128(_mm_set1_ps(v)) {}  // v in all four lanes
+    __forceinline          Color  (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {}  // fourth lane is set to 0
+
+    __forceinline Color           ( const Color& other ) : m128(other.m128) {}
+    __forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; }
+
+    __forceinline Color           ( const Color4& other ) : m128(other.m128) {}  // copies the whole register, so alpha ends up in the fourth lane
+    __forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+    __forceinline operator const __m128&() const { return m128; }  // lets a Color be passed straight to intrinsics
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Set (writes this color into the destination passed by reference)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }
+    __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; }  // alpha is always written as 1
+    __forceinline void set(Col3uc& d) const 
+    { 
+      vfloat4 s = clamp(vfloat4(m128))*255.0f;  // presumably clamps to [0,1] first -- confirm against vfloat4 clamp
+      d.r = (unsigned char)(s[0]);  // the casts truncate, they do not round
+      d.g = (unsigned char)(s[1]); 
+      d.b = (unsigned char)(s[2]); 
+    }
+    __forceinline void set(Col4uc& d) const 
+    { 
+      vfloat4 s = clamp(vfloat4(m128))*255.0f;  // same conversion as the Col3uc overload
+      d.r = (unsigned char)(s[0]); 
+      d.g = (unsigned char)(s[1]); 
+      d.b = (unsigned char)(s[2]); 
+      d.a = 255;  // alpha is always written as 255
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (the value is broadcast to all four lanes)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color( ZeroTy   ) : m128(_mm_set1_ps(0.0f)) {}
+    __forceinline Color( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator +( const Color& a ) { return a; }  // identity
+  __forceinline const Color operator -( const Color& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));  // only the sign bit of every lane
+    return _mm_xor_ps(a.m128, mask);  // toggling the sign bit negates each lane
+  }
+  __forceinline const Color abs  ( const Color& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));  // every bit except the sign bit
+    return _mm_and_ps(a.m128, mask);  // clearing the sign bit yields the magnitude
+  }
+  __forceinline const Color rcp  ( const Color& a )  // per-lane reciprocal: hardware estimate refined by Newton-Raphson
+  {
+#if defined(__aarch64__)
+    __m128 reciprocal = _mm_rcp_ps(a.m128);  // initial estimate
+    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);  // first refinement step
+    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);  // second refinement step
+    return (const Color)reciprocal;
+#else
+#if defined(__AVX512VL__)
+    const Color r = _mm_rcp14_ps(a.m128);  // AVX-512VL estimate intrinsic
+#else
+    const Color r = _mm_rcp_ps(a.m128);  // SSE estimate intrinsic
+#endif
+    return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(a, r))));   // one refinement step: computes r + r * (1 - a * r)
+
+#endif  //defined(__aarch64__)
+  }
+  __forceinline const Color rsqrt( const Color& a )  // per-lane reciprocal square root: hardware estimate refined by Newton-Raphson
+  {
+#if defined(__aarch64__)
+    __m128 r = _mm_rsqrt_ps(a.m128);  // initial estimate
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));  // first refinement step
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));  // second refinement step
+    return r;
+#else
+
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128);  // AVX-512VL estimate intrinsic
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128);  // SSE estimate intrinsic
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));  // one refinement step: computes 1.5*r - 0.5*a*r*r*r
+
+#endif  //defined(__aarch64__)
+  }
+  __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }  // per-lane square root via the SSE intrinsic
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator +( const Color& a, const Color& b ) { return _mm_add_ps(a.m128, b.m128); }  // lane-wise on all four lanes
+  __forceinline const Color operator -( const Color& a, const Color& b ) { return _mm_sub_ps(a.m128, b.m128); }
+  __forceinline const Color operator *( const Color& a, const Color& b ) { return _mm_mul_ps(a.m128, b.m128); }
+  __forceinline const Color operator *( const Color& a, const float  b ) { return a * Color(b); }  // scalar is broadcast first
+  __forceinline const Color operator *( const float  a, const Color& b ) { return Color(a) * b; }
+  __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); }  // multiplies by the refined reciprocal instead of dividing
+  __forceinline const Color operator /( const Color& a, const float  b ) { return a * rcp(b); }  // Color(float) is explicit, so this rcp is a float overload declared elsewhere
+
+  __forceinline const Color min( const Color& a, const Color& b ) { return _mm_min_ps(a.m128,b.m128); }  // lane-wise minimum
+  __forceinline const Color max( const Color& a, const Color& b ) { return _mm_max_ps(a.m128,b.m128); }  // lane-wise maximum
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator+=(Color& a, const Color& b) { a = a + b; return a; }  // updates a in place, returns a copy of the result
+  __forceinline const Color operator-=(Color& a, const Color& b) { a = a - b; return a; }
+  __forceinline const Color operator*=(Color& a, const Color& b) { a = a * b; return a; }
+  __forceinline const Color operator/=(Color& a, const Color& b) { a = a / b; return a; }
+  __forceinline const Color operator*=(Color& a, const float b) { a = a * b; return a; }
+  __forceinline const Color operator/=(Color& a, const float b) { a = a / b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Color& v) { const float sum      = v.r+v.g+v.b;      return sum; }       // only r,g,b take part
+  __forceinline float reduce_mul(const Color& v) { const float product  = v.r*v.g*v.b;      return product; }
+  __forceinline float reduce_min(const Color& v) { const float smallest = min(v.r,v.g,v.b); return smallest; }
+  __forceinline float reduce_max(const Color& v) { const float largest  = max(v.r,v.g,v.b); return largest; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }  // "& 7" keeps the r,g,b lanes only; all three must compare equal
+  __forceinline bool operator !=( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }  // true if any of r,g,b differs; the fourth lane is ignored
+  __forceinline bool operator < ( const Color& a, const Color& b ) {
+    return (a.r != b.r) ? (a.r < b.r)   // lexicographic order: the first differing channel decides
+         : (a.g != b.g) ? (a.g < b.g)
+         : (a.b != b.b) ? (a.b < b.b)
+         : false;                       // no channel differs
+  }
+
+   ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color select( bool s, const Color& t, const Color& f ) {  // whole-color choice driven by one bool
+    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();  // all bits set (0 == 0 in every lane) or all bits clear
+    return blendv_ps(f, t, mask);  // presumably picks t where the mask is set and f elsewhere -- confirm against blendv_ps
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Special Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! computes luminance of a color as a weighted sum of r, g and b (weights 0.212671, 0.715160, 0.072169) */
+  __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); }
+
+  /*! output operator: writes the color as "(r, g, b)"; the fourth lane is not printed */
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Color& a) {
+    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
+  }
+}
+
+#endif
--- a/thirdparty/embree/common/math/color_sycl.h
+++ b/thirdparty/embree/common/math/color_sycl.h
@@ -0,0 +1,219 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "constants.h"
+#include "col3.h"
+#include "col4.h"
+
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// RGBA Color Class (scalar variant for SYCL device code, no SSE register)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct Color4
+  {
+    struct { float r,g,b,a; };  // plain scalar channels; this variant has no SSE register
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color4 () {}  // leaves all channels uninitialized
+    //__forceinline Color4 ( const __m128 a ) : m128(a) {}
+
+    __forceinline explicit Color4 (const float v) : r(v), g(v), b(v), a(v) {}  // v in every channel, alpha included
+    __forceinline          Color4 (const float r, const float g, const float b, const float a) : r(r), g(g), b(b), a(a) {}
+
+    __forceinline explicit Color4 ( const Col3uc& other ) : r(other.r/255.0f), g(other.g/255.0f), b(other.b/255.0f), a(1.0f) {}  // 8-bit RGB divided by 255, alpha set to 1
+    __forceinline explicit Color4 ( const Col3f&  other ) : r(other.r), g(other.g), b(other.b), a(1.0f) {}  // alpha set to 1
+    __forceinline explicit Color4 ( const Col4uc& other ) : r(other.r/255.0f), g(other.g/255.0f), b(other.b/255.0f), a(other.a/255.0f) {}  // all four 8-bit channels divided by 255
+    __forceinline explicit Color4 ( const Col4f&  other ) : r(other.r), g(other.g), b(other.b), a(other.a) {}
+
+    //__forceinline Color4           ( const Color4& other ) : m128(other.m128) {}
+    //__forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Set (writes this color into the destination passed by reference)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }  // alpha is dropped
+    __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; }
+
+    __forceinline void set(Col3uc& d) const
+    {
+      d.r = (unsigned char)(clamp(r)*255.0f);  // presumably clamps to [0,1] first -- confirm against the scalar clamp; the cast truncates
+      d.g = (unsigned char)(clamp(g)*255.0f);
+      d.b = (unsigned char)(clamp(b)*255.0f);
+    }
+
+    __forceinline void set(Col4uc& d) const
+    {
+      d.r = (unsigned char)(clamp(r)*255.0f);  // same conversion as above, alpha included
+      d.g = (unsigned char)(clamp(g)*255.0f);
+      d.b = (unsigned char)(clamp(b)*255.0f);
+      d.a = (unsigned char)(clamp(a)*255.0f);
+    }
+    __forceinline void set(float &f) const
+    {
+      f = 0.2126f*r+0.7152f*g+0.0722f*b; // sRGB (Rec. 709) luminance; the weights sum to 1, alpha is ignored.
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (only zero and one are provided in this variant)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color4( ZeroTy   ) : r(0.0f), g(0.0f), b(0.0f), a(0.0f) {}
+    __forceinline Color4( OneTy    ) : r(1.0f), g(1.0f), b(1.0f), a(1.0f) {}
+    //__forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    //__forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// RGB Color Class (scalar variant for SYCL device code, no SSE register)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct Color
+  {
+    struct { float r,g,b; };  // plain scalar channels
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color() {}  // leaves all channels uninitialized
+    //__forceinline Color ( const __m128 a ) : m128(a) {}
+
+    __forceinline explicit Color(const float value) : r(value), g(value), b(value) {}  // same value in every channel
+    __forceinline Color(const float red, const float green, const float blue) : r(red), g(green), b(blue) {}
+
+    //__forceinline Color           ( const Color& other ) : m128(other.m128) {}
+    //__forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; }
+
+    //__forceinline Color           ( const Color4& other ) : m128(other.m128) {}
+    //__forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Set (writes this color into the destination passed by reference)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline void set(Col3f& dst) const { dst.r = r; dst.g = g; dst.b = b; }
+    __forceinline void set(Col4f& dst) const { dst.r = r; dst.g = g; dst.b = b; dst.a = 1.0f; }  // alpha is always written as 1
+
+#if 0
+    __forceinline void set(Col3uc& d) const 
+    { 
+      vfloat4 s = clamp(vfloat4(m128))*255.0f;
+      d.r = (unsigned char)(s[0]); 
+      d.g = (unsigned char)(s[1]); 
+      d.b = (unsigned char)(s[2]); 
+    }
+    __forceinline void set(Col4uc& d) const 
+    { 
+      vfloat4 s = clamp(vfloat4(m128))*255.0f;
+      d.r = (unsigned char)(s[0]); 
+      d.g = (unsigned char)(s[1]); 
+      d.b = (unsigned char)(s[2]); 
+      d.a = 255;  
+    }
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (only zero and one are provided in this variant)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color(ZeroTy) : r(0.0f), g(0.0f), b(0.0f) {}
+    __forceinline Color(OneTy)  : r(1.0f), g(1.0f), b(1.0f) {}
+    //__forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    //__forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator +( const Color& a ) { return a; }  // identity
+  __forceinline const Color operator -( const Color& a ) { return Color(-a.r, -a.g, -a.b); }  // per-channel negation
+  __forceinline const Color abs  ( const Color& a ) { return Color(abs(a.r), abs(a.g), abs(a.b)); }  // per-channel absolute value
+  __forceinline const Color rcp  ( const Color& a ) { return Color(1.0f/a.r, 1.0f/a.g, 1.0f/a.b); }  // plain division, no estimate-and-refine here
+  __forceinline const Color rsqrt( const Color& a ) { return Color(1.0f/sqrt(a.r), 1.0f/sqrt(a.g), 1.0f/sqrt(a.b)); }  // per-channel 1/sqrt
+  __forceinline const Color sqrt ( const Color& a ) { return Color(sqrt(a.r), sqrt(a.g), sqrt(a.b)); }  // per-channel square root
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator +( const Color& a, const Color& b ) { return Color(a.r+b.r, a.g+b.g, a.b+b.b); }  // per-channel arithmetic
+  __forceinline const Color operator -( const Color& a, const Color& b ) { return Color(a.r-b.r, a.g-b.g, a.b-b.b); }
+  __forceinline const Color operator *( const Color& a, const Color& b ) { return Color(a.r*b.r, a.g*b.g, a.b*b.b); }
+  __forceinline const Color operator *( const Color& a, const float  b ) { return a * Color(b); }  // scalar is broadcast first
+  __forceinline const Color operator *( const float  a, const Color& b ) { return Color(a) * b; }
+  __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); }  // multiplies by the per-channel reciprocal
+  __forceinline const Color operator /( const Color& a, const float  b ) { return a * rcp(b); }  // Color(float) is explicit, so this rcp is a float overload declared elsewhere
+
+  __forceinline const Color min( const Color& a, const Color& b ) { return Color(min(a.r,b.r), min(a.g,b.g), min(a.b,b.b)); }  // per-channel minimum
+  __forceinline const Color max( const Color& a, const Color& b ) { return Color(max(a.r,b.r), max(a.g,b.g), max(a.b,b.b)); }  // per-channel maximum
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator+=(Color& a, const Color& b) { a = a + b; return a; }  // updates a in place, returns a copy of the result
+  __forceinline const Color operator-=(Color& a, const Color& b) { a = a - b; return a; }
+  __forceinline const Color operator*=(Color& a, const Color& b) { a = a * b; return a; }
+  __forceinline const Color operator/=(Color& a, const Color& b) { a = a / b; return a; }
+  __forceinline const Color operator*=(Color& a, const float b) { a = a * b; return a; }
+  __forceinline const Color operator/=(Color& a, const float b) { a = a / b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Color& v) { const float sum      = v.r+v.g+v.b;      return sum; }       // combines the three channels into one float
+  __forceinline float reduce_mul(const Color& v) { const float product  = v.r*v.g*v.b;      return product; }
+  __forceinline float reduce_min(const Color& v) { const float smallest = min(v.r,v.g,v.b); return smallest; }
+  __forceinline float reduce_max(const Color& v) { const float largest  = max(v.r,v.g,v.b); return largest; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Color& a, const Color& b ) { return !(a.r != b.r || a.g != b.g || a.b != b.b); }  // equal when no channel differs
+  __forceinline bool operator !=( const Color& a, const Color& b ) { return !(a.r == b.r && a.g == b.g && a.b == b.b); }  // unequal when any channel differs
+  __forceinline bool operator < ( const Color& a, const Color& b ) {
+    return (a.r != b.r) ? (a.r < b.r)   // lexicographic order: the first differing channel decides
+         : (a.g != b.g) ? (a.g < b.g)
+         : (a.b != b.b) ? (a.b < b.b)
+         : false;                       // no channel differs
+  }
+
+   ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color select( bool s, const Color& t, const Color& f ) {
+    if (s) return t; else return f;  // t when s is true, f otherwise
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Special Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! computes luminance of a color as a weighted sum of r, g and b (weights 0.212671, 0.715160, 0.072169) */
+  __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); }
+
+  /*! output operator: writes the color as "(r, g, b)" to a std::ostream */
+  inline std::ostream& operator<<(std::ostream& cout, const Color& a) {
+    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
+  }
+}
--- a/thirdparty/embree/common/math/constants.cpp
+++ b/thirdparty/embree/common/math/constants.cpp
@@ -0,0 +1,8 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "constants.h"
+
+namespace embree
+{
+}
--- a/thirdparty/embree/common/math/constants.h
+++ b/thirdparty/embree/common/math/constants.h
@@ -0,0 +1,209 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+
+#include <limits>
+
+#define _USE_MATH_DEFINES
+#include <math.h> // using cmath causes issues under Windows
+#include <cfloat>
+#include <climits>
+
+namespace embree
+{
+  static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f;  // reciprocal of 255, for scaling 8-bit values
+  static MAYBE_UNUSED const float min_rcp_input = 1E-18f;  // for abs(x) >= min_rcp_input the newton raphson rcp calculation does not fail
+
+  /* we consider floating point numbers in that range as valid input numbers; const like its siblings so no translation unit can modify its copy */
+  static MAYBE_UNUSED const float FLT_LARGE = 1.844E18f;
+
+  struct TrueTy {  // tag type that converts to the bool value true
+    __forceinline operator bool( ) const { return true; }
+  };
+
+  const constexpr TrueTy True = TrueTy();  // ready-made instance of the tag
+
+  struct FalseTy {  // tag type that converts to the bool value false
+    __forceinline operator bool( ) const { return false; }
+  };
+
+  const constexpr FalseTy False = FalseTy();  // ready-made instance of the tag
+  
+  struct ZeroTy  // tag type that converts to 0 in each listed arithmetic type
+  {
+    __forceinline operator          double   ( ) const { return 0; }
+    __forceinline operator          float    ( ) const { return 0; }
+    __forceinline operator          long long( ) const { return 0; }
+    __forceinline operator unsigned long long( ) const { return 0; }
+    __forceinline operator          long     ( ) const { return 0; }
+    __forceinline operator unsigned long     ( ) const { return 0; }
+    __forceinline operator          int      ( ) const { return 0; }
+    __forceinline operator unsigned int      ( ) const { return 0; }
+    __forceinline operator          short    ( ) const { return 0; }
+    __forceinline operator unsigned short    ( ) const { return 0; }
+    __forceinline operator          char     ( ) const { return 0; }
+    __forceinline operator unsigned char     ( ) const { return 0; }
+  }; 
+
+  const constexpr ZeroTy zero = ZeroTy();  // the instance written as "zero" in initializers
+
+  struct OneTy  // tag type that converts to 1 in each listed arithmetic type
+  {
+    __forceinline operator          double   ( ) const { return 1; }
+    __forceinline operator          float    ( ) const { return 1; }
+    __forceinline operator          long long( ) const { return 1; }
+    __forceinline operator unsigned long long( ) const { return 1; }
+    __forceinline operator          long     ( ) const { return 1; }
+    __forceinline operator unsigned long     ( ) const { return 1; }
+    __forceinline operator          int      ( ) const { return 1; }
+    __forceinline operator unsigned int      ( ) const { return 1; }
+    __forceinline operator          short    ( ) const { return 1; }
+    __forceinline operator unsigned short    ( ) const { return 1; }
+    __forceinline operator          char     ( ) const { return 1; }
+    __forceinline operator unsigned char     ( ) const { return 1; }
+  };
+
+  const constexpr OneTy one = OneTy();  // the instance written as "one" in initializers
+
+  struct NegInfTy  // tag type that converts to the lowest value of each listed type
+  {
+    __forceinline operator          double   ( ) const { return -std::numeric_limits<double>::infinity(); }  // floating point: negative infinity
+    __forceinline operator          float    ( ) const { return -std::numeric_limits<float>::infinity(); }
+    __forceinline operator          long long( ) const { return std::numeric_limits<long long>::min(); }  // integers: numeric_limits min(), which is 0 for the unsigned types
+    __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::min(); }
+    __forceinline operator          long     ( ) const { return std::numeric_limits<long>::min(); }
+    __forceinline operator unsigned long     ( ) const { return std::numeric_limits<unsigned long>::min(); }
+    __forceinline operator          int      ( ) const { return std::numeric_limits<int>::min(); }
+    __forceinline operator unsigned int      ( ) const { return std::numeric_limits<unsigned int>::min(); }
+    __forceinline operator          short    ( ) const { return std::numeric_limits<short>::min(); }
+    __forceinline operator unsigned short    ( ) const { return std::numeric_limits<unsigned short>::min(); }
+    __forceinline operator          char     ( ) const { return std::numeric_limits<char>::min(); }
+    __forceinline operator unsigned char     ( ) const { return std::numeric_limits<unsigned char>::min(); }
+
+  };
+
+  const constexpr NegInfTy neg_inf = NegInfTy();  // the instance written as "neg_inf" in initializers
+
+  struct PosInfTy  // tag type that converts to the highest value of each listed type
+  {
+    __forceinline operator          double   ( ) const { return std::numeric_limits<double>::infinity(); }  // floating point: positive infinity
+    __forceinline operator          float    ( ) const { return std::numeric_limits<float>::infinity(); }
+    __forceinline operator          long long( ) const { return std::numeric_limits<long long>::max(); }  // integers: numeric_limits max()
+    __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::max(); }
+    __forceinline operator          long     ( ) const { return std::numeric_limits<long>::max(); }
+    __forceinline operator unsigned long     ( ) const { return std::numeric_limits<unsigned long>::max(); }
+    __forceinline operator          int      ( ) const { return std::numeric_limits<int>::max(); }
+    __forceinline operator unsigned int      ( ) const { return std::numeric_limits<unsigned int>::max(); }
+    __forceinline operator          short    ( ) const { return std::numeric_limits<short>::max(); }
+    __forceinline operator unsigned short    ( ) const { return std::numeric_limits<unsigned short>::max(); }
+    __forceinline operator          char     ( ) const { return std::numeric_limits<char>::max(); }
+    __forceinline operator unsigned char     ( ) const { return std::numeric_limits<unsigned char>::max(); }
+  };
+
+  const constexpr PosInfTy     inf = PosInfTy();  // "inf" and "pos_inf" are two names for the same tag value
+  const constexpr PosInfTy pos_inf = PosInfTy();
+
+  struct NaNTy  // tag type that converts to a quiet NaN; floating point only
+  {
+    __forceinline operator double( ) const { return std::numeric_limits<double>::quiet_NaN(); }
+    __forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); }
+  };
+
+  const constexpr NaNTy nan = NaNTy();  // the instance written as "nan" in initializers
+
+  struct UlpTy  // tag type that converts to numeric_limits epsilon() of the target floating-point type
+  {
+    __forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); }
+    __forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); }
+  };
+  
+  const constexpr UlpTy ulp = UlpTy();  // the instance written as "ulp" in initializers
+
+  struct PiTy  // tag type that converts to pi in double or float precision
+  {
+    __forceinline operator double( ) const { return static_cast<double>(M_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(M_PI); }
+  };
+
+  const constexpr PiTy pi = PiTy();  // the instance written as "pi" in expressions
+
+  struct OneOverPiTy  // tag type that converts to 1/pi in double or float precision
+  {
+    __forceinline operator double( ) const { return static_cast<double>(M_1_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(M_1_PI); }
+  };
+
+  const constexpr OneOverPiTy one_over_pi = OneOverPiTy();  // ready-made instance of the tag
+
+  struct TwoPiTy  // tag type that converts to 2*pi; the product is formed in double before any narrowing
+  {
+    __forceinline operator double( ) const { return static_cast<double>(2.0*M_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(2.0*M_PI); }
+  };
+
+  const constexpr TwoPiTy two_pi = TwoPiTy();  // ready-made instance of the tag
+
+  struct OneOverTwoPiTy  // tag type that converts to 1/(2*pi), computed as 0.5 * (1/pi) in double
+  {
+    __forceinline operator double( ) const { return static_cast<double>(0.5*M_1_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(0.5*M_1_PI); }
+  };
+
+  const constexpr OneOverTwoPiTy one_over_two_pi = OneOverTwoPiTy();  // ready-made instance of the tag
+
+  struct FourPiTy  // tag type that converts to 4*pi; the product is formed in double before any narrowing
+  {
+    __forceinline operator double( ) const { return static_cast<double>(4.0*M_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(4.0*M_PI); }
+  };
+
+  const constexpr FourPiTy four_pi = FourPiTy();  // ready-made instance of the tag
+
+  struct OneOverFourPiTy  // tag type that converts to 1/(4*pi), computed as 0.25 * (1/pi) in double
+  {
+    __forceinline operator double( ) const { return static_cast<double>(0.25*M_1_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(0.25*M_1_PI); }
+  };
+
+  const constexpr OneOverFourPiTy one_over_four_pi = OneOverFourPiTy();  // ready-made instance of the tag
+
+  struct StepTy {  // tag type; every scalar conversion below yields 0
+    __forceinline operator          double   ( ) const { return 0; }
+    __forceinline operator          float    ( ) const { return 0; }
+    __forceinline operator          long long( ) const { return 0; }
+    __forceinline operator unsigned long long( ) const { return 0; }
+    __forceinline operator          long     ( ) const { return 0; }
+    __forceinline operator unsigned long     ( ) const { return 0; }
+    __forceinline operator          int      ( ) const { return 0; }
+    __forceinline operator unsigned int      ( ) const { return 0; }
+    __forceinline operator          short    ( ) const { return 0; }
+    __forceinline operator unsigned short    ( ) const { return 0; }
+    __forceinline operator          char     ( ) const { return 0; }
+    __forceinline operator unsigned char     ( ) const { return 0; }
+  };
+
+  const constexpr StepTy step = StepTy();  // presumably selects "step" constructors of vector types elsewhere -- confirm against users
+
+  struct ReverseStepTy {  // empty tag type: carries no data and no conversions
+  };
+
+  const constexpr ReverseStepTy reverse_step = ReverseStepTy();  // ready-made instance of the tag
+
+  struct EmptyTy {  // empty tag type: carries no data and no conversions
+  };
+
+  const constexpr EmptyTy empty = EmptyTy();  // ready-made instance of the tag
+
+  struct FullTy {  // empty tag type: carries no data and no conversions
+  };
+
+  const constexpr FullTy full = FullTy();  // ready-made instance of the tag
+
+  struct UndefinedTy {  // empty tag type: carries no data and no conversions
+  };
+
+  const constexpr UndefinedTy undefined = UndefinedTy();  // ready-made instance of the tag
+}
--- a/thirdparty/embree/common/math/emath.h
+++ b/thirdparty/embree/common/math/emath.h
@@ -0,0 +1,468 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "constants.h"
+#include <cmath>
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#  include "math_sycl.h"
+#else
+
+#if defined(__ARM_NEON)
+#include "../simd/arm/emulation.h"
+#else
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#include <immintrin.h>
+#endif
+
+#if defined(__WIN32__)
+#if defined(_MSC_VER) && (_MSC_VER <= 1700)
+namespace std
+{
+  /*! Replacements for isinf/isnan/isfinite, built on the MSVC CRT
+      functions _finite and _isnan. */
+
+  /*! true only for +/-infinity. _finite() is zero for NaN as well as
+      for infinities, so NaN has to be excluded explicitly. */
+  __forceinline bool isinf ( const float x ) { return _finite(x) == 0 && _isnan(x) == 0; }
+
+  /*! true if x is NaN */
+  __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; }
+
+  /*! true if x is neither infinite nor NaN */
+  __forceinline bool isfinite (const float x) { return _finite(x) != 0; }
+}
+#endif
+#endif
+
+namespace embree
+{
+  /*! checks that v lies strictly between -FLT_LARGE and +FLT_LARGE;
+      a NaN fails both comparisons and is therefore reported invalid */
+  __forceinline bool isvalid ( const float& v ) {
+    const bool aboveLowerLimit = v > -FLT_LARGE;
+    const bool belowUpperLimit = v < +FLT_LARGE;
+    return aboveLowerLimit & belowUpperLimit;
+  }
+
+  /*! returns the bit pattern of a float as an int (no numeric conversion) */
+  __forceinline int cast_f2i(float f) {
+    union { float asFloat; int asInt; } bits;
+    bits.asFloat = f;
+    return bits.asInt;
+  }
+
+  /*! returns the float whose bit pattern is the given int (no numeric conversion) */
+  __forceinline float cast_i2f(int i) {
+    union { float asFloat; int asInt; } bits;
+    bits.asInt = i;
+    return bits.asFloat;
+  }
+
+  /*! numeric conversion float -> int (fractional part is discarded) */
+  __forceinline int   toInt  (const float& a) { return static_cast<int>(a); }
+  /*! numeric conversion int -> float */
+  __forceinline float toFloat(const int&   a) { return static_cast<float>(a); }
+
+  /*! reinterprets the bits of a float as an int (no numeric conversion).
+      Goes through the union based cast_f2i instead of dereferencing a
+      type-punned pointer, which violates the strict aliasing rule. */
+  __forceinline int   asInt  (const float& a) { return cast_f2i(a); }
+
+  /*! reinterprets the bits of an int as a float (no numeric conversion);
+      see asInt for why cast_i2f is used rather than a pointer cast. */
+  __forceinline float asFloat(const int&   a) { return cast_i2f(a); }
+
+#if defined(__WIN32__)
+  /*! Windows only: true if _finite() reports x as finite */
+  __forceinline bool finite ( const float x ) { return 0 != _finite(x); }
+#endif
+
+  /*! -1 for x < 0, otherwise +1 (this includes zero and NaN) */
+  __forceinline float sign ( const float x ) {
+    if (x<0) return -1.0f;
+    return 1.0f;
+  }
+
+  /*! square of x */
+  __forceinline float sqr  ( const float x ) {
+    const float squared = x*x;
+    return squared;
+  }
+
+  /*! Fast reciprocal of x: starts from a hardware reciprocal estimate
+      and refines it, instead of performing a division. */
+  __forceinline float rcp  ( const float x )
+  {
+#if defined(__aarch64__)
+      // Move scalar to vector register and do rcp.
+      __m128 a;
+      a[0] = x;
+      float32x4_t reciprocal = vrecpeq_f32(a);
+      // two refinement passes over the initial estimate
+      reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+      reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+      return reciprocal[0];
+#else
+
+    const __m128 a = _mm_set_ss(x);
+
+    // r is the initial estimate of 1/x
+#if defined(__AVX512VL__)
+    const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a);
+#else
+    const __m128 r = _mm_rcp_ss(a);
+#endif
+
+    // one refinement step: r * (2 - r*a), fused on AVX2
+#if defined(__AVX2__)
+    return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f))));
+#else
+    return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
+#endif
+
+#endif  //defined(__aarch64__)
+  }
+
+  /*! returns a float holding only the sign bit of x (bits of x ANDed with 0x80000000) */
+  __forceinline float signmsk ( const float x ) {
+#if defined(__aarch64__)
+      // FP and Neon shares same vector register in arm64
+      __m128 a;
+      __m128i b;
+      a[0] = x;
+      b[0] = 0x80000000;
+      a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+      return a[0];
+#else
+    return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#endif
+  }
+  /*! bitwise XOR of the bit patterns of x and y, returned as a float */
+  __forceinline float xorf( const float x, const float y ) {
+#if defined(__aarch64__)
+      // FP and Neon shares same vector register in arm64
+      __m128 a;
+      __m128 b;
+      a[0] = x;
+      b[0] = y;
+      a = _mm_xor_ps(a, b);
+      return a[0];
+#else
+    return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
+#endif
+  }
+  /*! bitwise AND of the bit pattern of x with the integer mask y, returned as a float */
+  __forceinline float andf( const float x, const unsigned y ) {
+#if defined(__aarch64__)
+      // FP and Neon shares same vector register in arm64
+      __m128 a;
+      __m128i b;
+      a[0] = x;
+      b[0] = y;
+      a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+      return a[0];
+#else
+    return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
+#endif
+  }
+  /*! Fast reciprocal square root of x: starts from a hardware estimate
+      and refines it, instead of computing 1/sqrt(x) directly. */
+  __forceinline float rsqrt( const float x )
+  {
+#if defined(__aarch64__)
+      // FP and Neon shares same vector register in arm64
+      __m128 a;
+      a[0] = x;
+      __m128 value = _mm_rsqrt_ps(a);
+      // two refinement passes over the initial estimate
+      value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
+      value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
+      return value[0];
+#else
+
+    const __m128 a = _mm_set_ss(x);
+    // r is the initial estimate of 1/sqrt(x)
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);
+#else
+    __m128 r = _mm_rsqrt_ss(a);
+#endif
+    // one refinement step: 1.5*r - 0.5*a*r*r*r
+    const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r),
+                                _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
+    return _mm_cvtss_f32(c);
+#endif
+  }
+
+#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700)
+  /* replacements used when building with MSVC versions up to _MSC_VER 1700 */
+  // NOTE(review): the float version scales x by about 1.1 or 0.9 rather than stepping to the adjacent representable value - confirm this is intended
+  __forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); }
+  __forceinline double nextafter(double x, double y) { return _nextafter(x, y); }
+  // NOTE(review): returns int, and the (int) cast truncates toward zero, so negative inputs are not rounded to nearest - verify against callers
+  __forceinline int roundf(float f) { return (int)(f + 0.5f); }
+#else
+  /* forward to the C library implementations */
+  __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); }
+  __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); }
+#endif
+
+  /* single precision math functions; each forwards to the global C library function of the same purpose */
+  __forceinline float abs  ( const float x ) { return ::fabsf(x); }
+  __forceinline float acos ( const float x ) { return ::acosf (x); }
+  __forceinline float asin ( const float x ) { return ::asinf (x); }
+  __forceinline float atan ( const float x ) { return ::atanf (x); }
+  __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); }
+  __forceinline float cos  ( const float x ) { return ::cosf  (x); }
+  __forceinline float cosh ( const float x ) { return ::coshf (x); }
+  __forceinline float exp  ( const float x ) { return ::expf  (x); }
+  __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); }
+  __forceinline float log  ( const float x ) { return ::logf  (x); }
+  __forceinline float log10( const float x ) { return ::log10f(x); }
+  __forceinline float pow  ( const float x, const float y ) { return ::powf  (x, y); }
+  __forceinline float sin  ( const float x ) { return ::sinf  (x); }
+  __forceinline float sinh ( const float x ) { return ::sinhf (x); }
+  __forceinline float sqrt ( const float x ) { return ::sqrtf (x); }
+  __forceinline float tan  ( const float x ) { return ::tanf  (x); }
+  __forceinline float tanh ( const float x ) { return ::tanhf (x); }
+  __forceinline float floor( const float x ) { return ::floorf (x); }
+  __forceinline float ceil ( const float x ) { return ::ceilf (x); }
+  /* fractional part relative to floor(x), so the result is non-negative for finite x */
+  __forceinline float frac ( const float x ) { return x-floor(x); }
+
+  /* double precision math functions; most forward to the global C library function of the same purpose */
+  __forceinline double abs  ( const double x ) { return ::fabs(x); }
+  /* -1 for x < 0, otherwise +1 */
+  __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; }
+  __forceinline double acos ( const double x ) { return ::acos (x); }
+  __forceinline double asin ( const double x ) { return ::asin (x); }
+  __forceinline double atan ( const double x ) { return ::atan (x); }
+  __forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); }
+  __forceinline double cos  ( const double x ) { return ::cos  (x); }
+  __forceinline double cosh ( const double x ) { return ::cosh (x); }
+  __forceinline double exp  ( const double x ) { return ::exp  (x); }
+  __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); }
+  __forceinline double log  ( const double x ) { return ::log  (x); }
+  __forceinline double log10( const double x ) { return ::log10(x); }
+  __forceinline double pow  ( const double x, const double y ) { return ::pow  (x, y); }
+  /* exact division here, unlike the estimate based float overload of rcp */
+  __forceinline double rcp  ( const double x ) { return 1.0/x; }
+  __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); }
+  __forceinline double sin  ( const double x ) { return ::sin  (x); }
+  __forceinline double sinh ( const double x ) { return ::sinh (x); }
+  __forceinline double sqr  ( const double x ) { return x*x; }
+  __forceinline double sqrt ( const double x ) { return ::sqrt (x); }
+  __forceinline double tan  ( const double x ) { return ::tan  (x); }
+  __forceinline double tanh ( const double x ) { return ::tanh (x); }
+  __forceinline double floor( const double x ) { return ::floor (x); }
+  __forceinline double ceil ( const double x ) { return ::ceil (x); }
+
+  /* mini/maxi: scalar min/max. These are only defined for aarch64 and for
+     builds with SSE4.1; no fallback is provided here otherwise. */
+#if defined(__aarch64__)
+    __forceinline float mini(float a, float b) {
+        // FP and Neon shares same vector register in arm64
+        __m128 x;
+        __m128 y;
+        x[0] = a;
+        y[0] = b;
+        x = _mm_min_ps(x, y);
+        return x[0];
+    }
+#elif defined(__SSE4_1__)
+  /* compares the float bit patterns as signed 32-bit integers; that ordering
+     agrees with the float ordering only when both values are non-negative */
+  __forceinline float mini(float a, float b) {
+    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+    const __m128i ci = _mm_min_epi32(ai,bi);
+    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+  }
+#endif
+
+#if defined(__aarch64__)
+    __forceinline float maxi(float a, float b) {
+        // FP and Neon shares same vector register in arm64
+        __m128 x;
+        __m128 y;
+        x[0] = a;
+        y[0] = b;
+        x = _mm_max_ps(x, y);
+        return x[0];
+    }
+#elif defined(__SSE4_1__)
+  /* integer comparison of the bit patterns, see mini above */
+  __forceinline float maxi(float a, float b) {
+    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+    const __m128i ci = _mm_max_epi32(ai,bi);
+    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+  }
+#endif
+
+  /*! returns a+a, i.e. the doubled value, using only operator+ of T */
+  template<typename T>
+    __forceinline T twice(const T& a)
+  {
+    return a+a;
+  }
+
+  /* two-argument minimum for the built-in scalar types. All overloads return b
+     unless a<b holds; for float/double this means b is returned whenever
+     either operand is NaN. */
+  __forceinline      int min(int      a, int      b) { return a<b ? a:b; }
+  __forceinline unsigned min(unsigned a, unsigned b) { return a<b ? a:b; }
+  __forceinline  int64_t min(int64_t  a, int64_t  b) { return a<b ? a:b; }
+  __forceinline    float min(float    a, float    b) { return a<b ? a:b; }
+  __forceinline   double min(double   a, double   b) { return a<b ? a:b; }
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
+  __forceinline   size_t min(size_t   a, size_t   b) { return a<b ? a:b; }
+#endif
+#if defined(__EMSCRIPTEN__)
+  __forceinline   long   min(long     a, long     b) { return a<b ? a:b; }
+#endif
+
+  /* minimum of 3, 4 and 5 values, built from the two-argument min */
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); }
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); }
+
+  /* same reductions built from the two-argument mini */
+  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); }
+  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); }
+  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); }
+
+  /* two-argument maximum for the built-in scalar types. All overloads return a
+     unless a<b holds; for float/double this means a is returned whenever
+     either operand is NaN. */
+  __forceinline      int max(int      a, int      b) { return a<b ? b:a; }
+  __forceinline unsigned max(unsigned a, unsigned b) { return a<b ? b:a; }
+  __forceinline  int64_t max(int64_t  a, int64_t  b) { return a<b ? b:a; }
+  __forceinline    float max(float    a, float    b) { return a<b ? b:a; }
+  __forceinline   double max(double   a, double   b) { return a<b ? b:a; }
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
+  __forceinline   size_t max(size_t   a, size_t   b) { return a<b ? b:a; }
+#endif
+#if defined(__EMSCRIPTEN__)
+  __forceinline   long   max(long     a, long     b) { return a<b ? b:a; }
+#endif
+
+  /* maximum of 3, 4 and 5 values, built from the two-argument max */
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); }
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); }
+
+  /* same reductions built from the two-argument maxi */
+  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); }
+  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); }
+  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); }
+
+#if defined(__MACOSX__)
+  /*! ssize_t overloads of min and max, provided on macOS only */
+  __forceinline ssize_t min(ssize_t a, ssize_t b) {
+    if (a<b) return a;
+    return b;
+  }
+  __forceinline ssize_t max(ssize_t a, ssize_t b) {
+    if (a<b) return b;
+    return a;
+  }
+#endif
+
+#if defined(__MACOSX__) && !defined(__INTEL_COMPILER)
+  /*! sincosf for macOS, forwarding to __sincosf */
+  __forceinline void sincosf(float x, float *sin, float *cos)
+  {
+    __sincosf(x,sin,cos);
+  }
+#endif
+
+#if defined(__WIN32__) || defined(__FreeBSD__)
+  /*! sincosf for Windows and FreeBSD: sine is written to *s, then cosine to *c */
+  __forceinline void sincosf(float x, float *s, float *c)
+  {
+    s[0] = sinf(x);
+    c[0] = cosf(x);
+  }
+#endif
+
+  /*! limits x to [lower,upper]; the bounds default to T(zero) and T(one).
+      The upper bound is applied first, then the lower bound. */
+  template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one))
+  {
+    return max(min(x,upper),
+               lower);
+  }
+
+  /*! limits x to [T(zero),upper] */
+  template<typename T> __forceinline T clampz(const T& x, const T& upper)
+  {
+    return max(T(zero),
+               min(x,upper));
+  }
+
+  /*! degrees -> radians: multiplies by pi/180 (single precision literal) */
+  template<typename T> __forceinline T  deg2rad ( const T& x )
+  {
+    return x * T(1.74532925199432957692e-2f);
+  }
+
+  /*! radians -> degrees: multiplies by 180/pi (single precision literal) */
+  template<typename T> __forceinline T  rad2deg ( const T& x )
+  {
+    return x * T(5.72957795130823208768e1f);
+  }
+
+  /*! sqrt(1-x*x), with the radicand clamped at zero so it never goes negative */
+  template<typename T> __forceinline T  sin2cos ( const T& x )
+  {
+    return sqrt(max(T(zero),
+                    T(one)-x*x));
+  }
+
+  /*! same formula as sin2cos */
+  template<typename T> __forceinline T  cos2sin ( const T& x )
+  {
+    return sin2cos(x);
+  }
+
+  /* multiply-add family:
+       madd  =   a*b + c     msub  =   a*b - c
+       nmadd = -(a*b) + c    nmsub = -(a*b) - c
+     The AVX2 branch uses the fused multiply-add intrinsics. */
+#if defined(__AVX2__)
+  __forceinline float madd  ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+  __forceinline float msub  ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+  __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+  __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+  
+#elif defined (__aarch64__) && defined(__clang__)
+  /* clang on aarch64: floating point contraction is switched to "fast" for these four functions only */
+#pragma clang fp contract(fast)
+__forceinline float madd  ( const float a, const float b, const float c) { return a*b + c; }
+__forceinline float msub  ( const float a, const float b, const float c) { return a*b - c; }
+__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; }
+__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); }
+#pragma clang fp contract(on)
+  
+#else
+  /* plain arithmetic fallback */
+  __forceinline float madd  ( const float a, const float b, const float c) { return a*b+c; }
+  __forceinline float msub  ( const float a, const float b, const float c) { return a*b-c; }
+  __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;}
+  __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; }
+#endif
+
+  /*! random functions, all built on the C library rand() */
+  /* primary template: types without a specialization below always get T(0) */
+  template<typename T> T random() { return T(0); }
+#if defined(_WIN32)
+  /* Windows: XORs three rand() results shifted by 0, 8 and 16 bits */
+  template<> __forceinline int      random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); }
+  template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); }
+#else
+  template<> __forceinline int      random() { return int(rand()); }
+  /* XORs two rand() results, the second shifted by 16 bits */
+  template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); }
+#endif
+  /* floating point variants return rand()/RAND_MAX, so both 0 and 1 are possible results */
+  template<> __forceinline float  random() { return rand()/float(RAND_MAX); }
+  template<> __forceinline double random() { return rand()/double(RAND_MAX); }
+
+#if defined(_WIN32)
+  /*! Windows replacement for POSIX drand48(), built on rand().
+      POSIX specifies a result in the half-open range [0,1); dividing by
+      RAND_MAX+1 (instead of RAND_MAX) keeps 1.0 from ever being returned. */
+  __forceinline double drand48() {
+    return double(rand())/(double(RAND_MAX)+1.0);
+  }
+
+  /*! Windows replacement for POSIX srand48(): seeds rand() with the given value */
+  __forceinline void srand48(long seed) {
+    srand((unsigned int)seed);
+  }
+#endif
+
+  /*! scalar selects: return t when s is true, otherwise f */
+  __forceinline bool  select(bool s, bool  t , bool f) { if (s) return t; return f; }
+  __forceinline int   select(bool s, int   t,   int f) { if (s) return t; return f; }
+  __forceinline float select(bool s, float t, float f) { if (s) return t; return f; }
+
+  /*! mask reductions for a single bool */
+  __forceinline bool none(bool s) { return s == false; }
+  __forceinline bool all (bool s) { return s; }
+  __forceinline bool any (bool s) { return s; }
+
+  /*! a single bool as a bit mask: 1 if set, 0 otherwise */
+  __forceinline unsigned movemask (bool s) { return static_cast<unsigned>(s); }
+
+  /*! linear interpolation: (1-t)*v0 + t*v1 */
+  __forceinline float lerp(const float v0, const float v1, const float t)
+  {
+    const float weight0 = 1.0f-t;
+    const float scaled1 = t*v1;
+    return madd(weight0,v0,scaled1);
+  }
+
+  /*! bilinear interpolation of the four scalars x0..x3: x0/x2 and x1/x3 are
+      blended with v, and the two results are then blended with u */
+  template<typename T>
+    __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v)
+  {
+    return madd((1.0f-u),
+                madd((1.0f-v),T(x0),v*T(x2)),
+                u*madd((1.0f-v),T(x1),v*T(x3)));
+  }
+
+  /*! exchanges the values of a and b through a temporary copy */
+  template<typename T> __forceinline void xchg ( T& a, T& b )
+  {
+    const T saved = a;
+    a = b;
+    b = saved;
+  }
+
+  /*  masked load/store; only the float specialization is defined here */
+  template<typename Ty> struct mem;
+ 
+  template<> struct mem<float>
+  {
+    /*! reads a float from ptr when mask is set, otherwise yields 0.0f without touching ptr */
+    static __forceinline float load (bool mask, const void* ptr) {
+      if (!mask) return 0.0f;
+      return *(float*)ptr;
+    }
+    static __forceinline float loadu(bool mask, const void* ptr) {
+      if (!mask) return 0.0f;
+      return *(float*)ptr;
+    }
+  
+    /*! writes v to ptr only when mask is set */
+    static __forceinline void store (bool mask, void* ptr, const float v) {
+      if (!mask) return;
+      *(float*)ptr = v;
+    }
+    static __forceinline void storeu(bool mask, void* ptr, const float v) {
+      if (!mask) return;
+      *(float*)ptr = v;
+    }
+  };
+  
+  /*! bit reverse operation: swaps neighbouring bits, then bit pairs, nibbles,
+      bytes and finally the two 16-bit halves */
+  // NOTE(review): the masks and the final 16-bit swap look written for a 32-bit unsigned T - confirm before using wider or signed types
+  template<class T>
+    __forceinline T bitReverse(const T& vin)
+  {
+    T v = vin;
+    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+    v = ( v >> 16             ) | ( v               << 16);
+    return v;
+  }
+
+  /*! bit interleave operation: spreads the low 10 bits of each input so that
+      two zero bits separate consecutive bits, then merges them with x at bit
+      positions 3i, y at 3i+1 and z at 3i+2 */
+  template<class T>
+    __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin)
+  {
+	T x = xin, y = yin, z = zin;
+    // each step halves the group size; the last mask keeps every third bit
+    x = (x | (x << 16)) & 0x030000FF;
+    x = (x | (x <<  8)) & 0x0300F00F;
+    x = (x | (x <<  4)) & 0x030C30C3;
+    x = (x | (x <<  2)) & 0x09249249;
+
+    y = (y | (y << 16)) & 0x030000FF;
+    y = (y | (y <<  8)) & 0x0300F00F;
+    y = (y | (y <<  4)) & 0x030C30C3;
+    y = (y | (y <<  2)) & 0x09249249;
+
+    z = (z | (z << 16)) & 0x030000FF;
+    z = (z | (z <<  8)) & 0x0300F00F;
+    z = (z | (z <<  4)) & 0x030C30C3;
+    z = (z | (z <<  2)) & 0x09249249;
+
+    return x | (y << 1) | (z << 2);
+  }
+
+#if defined(__AVX2__) && !defined(__aarch64__)
+
+  /*! unsigned int specialization of bitInterleave that deposits each input with
+      pdep. The deposit masks cover 11 bits for x and y and 10 bits for z, so
+      for inputs wider than 10 bits the result differs from the generic version. */
+  // NOTE(review): guarded by __AVX2__ only; presumably pdep is a BMI2 wrapper from the intrinsics header - confirm BMI2 is implied
+  template<>
+    __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
+  {
+    const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ );
+    const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */);
+    const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */);
+    return xx | yy | zz;
+  }
+
+#endif
+
+  /*! bit interleave operation for 64bit data types: keeps the low 21 bits of
+      each input, spreads them so that two zero bits separate consecutive bits,
+      then merges them with x at bit positions 3i, y at 3i+1 and z at 3i+2 */
+  template<class T>
+    __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){
+    // 0x1fffff selects the low 21 bits
+    T x = xin & 0x1fffff;
+    T y = yin & 0x1fffff;
+    T z = zin & 0x1fffff;
+
+    // each step halves the group size; the last mask keeps every third bit
+    x = (x | x << 32) & 0x1f00000000ffff;
+    x = (x | x << 16) & 0x1f0000ff0000ff;
+    x = (x | x << 8) & 0x100f00f00f00f00f;
+    x = (x | x << 4) & 0x10c30c30c30c30c3;
+    x = (x | x << 2) & 0x1249249249249249;
+
+    y = (y | y << 32) & 0x1f00000000ffff;
+    y = (y | y << 16) & 0x1f0000ff0000ff;
+    y = (y | y << 8) & 0x100f00f00f00f00f;
+    y = (y | y << 4) & 0x10c30c30c30c30c3;
+    y = (y | y << 2) & 0x1249249249249249;
+
+    z = (z | z << 32) & 0x1f00000000ffff;
+    z = (z | z << 16) & 0x1f0000ff0000ff;
+    z = (z | z << 8) & 0x100f00f00f00f00f;
+    z = (z | z << 4) & 0x10c30c30c30c30c3;
+    z = (z | z << 2) & 0x1249249249249249;
+
+    return x | (y << 1) | (z << 2);
+  }
+}
+
+#endif
--- a/thirdparty/embree/common/math/interval.h
+++ b/thirdparty/embree/common/math/interval.h
@@ -0,0 +1,161 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+#include "vec3.h"
+#include "bbox.h"
+
+namespace embree
+{
+  /*! Closed interval [lower,upper] over a value type V, with interval
+      arithmetic (+, -, *), merging and intersection. */
+  template<typename V>
+    struct Interval
+    {
+      V lower, upper;
+      
+      /*! default constructor leaves lower and upper uninitialized */
+      __forceinline Interval() {}
+      __forceinline Interval           ( const Interval& other ) { lower = other.lower; upper = other.upper; }
+      __forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; }
+
+      /*! degenerate interval [a,a]; not explicit, so a V converts implicitly */
+      __forceinline Interval(const V& a) : lower(a), upper(a) {}
+      __forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {}
+      /*! takes over the bounds of a BBox */
+      __forceinline Interval(const BBox<V>& a) : lower(a.lower), upper(a.upper) {}
+          
+      /*! tests if interval is empty */
+      //__forceinline bool empty() const { return lower > upper; }
+      
+      /*! computes the size of the interval */
+      __forceinline V size() const { return upper - lower; }
+      
+      /*! computes the midpoint of the interval */
+      __forceinline V center() const { return 0.5f*(lower+upper); }
+      
+      /*! grows this interval in place so that it also covers other */
+      __forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; }
+      __forceinline const Interval& extend(const V   & other) { lower = min(lower,other      ); upper = max(upper,other      ); return *this; }
+      
+      /*! interval sum: bounds are added pairwise */
+      __forceinline friend Interval operator +( const Interval& a, const Interval& b ) {
+        return Interval(a.lower+b.lower,a.upper+b.upper);
+      }
+      
+      /*! interval difference: the smallest result is a.lower-b.upper, the largest a.upper-b.lower */
+      __forceinline friend Interval operator -( const Interval& a, const Interval& b ) {
+        return Interval(a.lower-b.upper,a.upper-b.lower);
+      }
+      
+      /*! shifts both bounds down by the value b */
+      __forceinline friend Interval operator -( const Interval& a, const V& b ) {
+        return Interval(a.lower-b,a.upper-b);
+      }
+      
+      /*! interval product: min and max over the four products of the bounds */
+      __forceinline friend Interval operator *( const Interval& a, const Interval& b )
+      {
+        const V ll = a.lower*b.lower;
+        const V lu = a.lower*b.upper;
+        const V ul = a.upper*b.lower;
+        const V uu = a.upper*b.upper;
+        return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu));
+      }
+      
+      /*! smallest interval covering all arguments */
+      __forceinline friend Interval merge( const Interval& a, const Interval& b) {
+        return Interval(min(a.lower,b.lower),max(a.upper,b.upper));
+      }
+      
+      __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c) {
+        return merge(merge(a,b),c);
+      }
+      
+      __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) {
+        return merge(merge(a,b),merge(c,d));
+      }
+      
+      /*! intersect intervals; the result has lower > upper when the inputs do not overlap */
+      __forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); }
+      __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); }
+      __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); }       
+      
+      /*! prints the interval as "[lower, upper]" */
+      friend embree_ostream operator<<(embree_ostream cout, const Interval& a) {
+        return cout << "[" << a.lower << ", " << a.upper << "]";
+      }
+      
+      ////////////////////////////////////////////////////////////////////////////////
+      /// Constants
+      ////////////////////////////////////////////////////////////////////////////////
+      
+      /*! empty interval: lower = +inf, upper = -inf */
+      __forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {}
+      /*! full interval: lower = -inf, upper = +inf */
+      __forceinline Interval( FullTy  ) : lower(neg_inf), upper(pos_inf) {}
+    };
+
+  /*! an interval is empty when its lower bound exceeds its upper bound */
+  __forceinline bool isEmpty(const Interval<float>& v) { 
+    return v.lower > v.upper;
+  }
+
+  /*! same test for Interval<vfloatx>; the result is a vboolx */
+  __forceinline vboolx isEmpty(const Interval<vfloatx>& v) {
+    return v.lower > v.upper;
+  }
+  
+  /*! subset relation; both comparisons are strict, so an interval is not a subset of itself */
+  template<typename T> __forceinline bool subset( const Interval<T>& a, const Interval<T>& b ) { 
+    return (a.lower > b.lower) && (a.upper < b.upper);
+  }
+
+  /*! 2D subset relation: must hold in both the x and the y component */
+  template<typename T> __forceinline bool subset( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) { 
+    return subset(a.x,b.x) && subset(a.y,b.y);
+  }
+
+  /*! component-wise intersection of two 2D intervals */
+  template<typename T> __forceinline const Vec2<Interval<T>> intersect( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) {
+    return Vec2<Interval<T>>(intersect(a.x,b.x),intersect(a.y,b.y));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! selects between two intervals with a scalar condition: the lower and
+      upper bounds are selected independently with the same condition s */
+  template<typename T> __forceinline Interval<T> select ( bool s, const Interval<T>& t, const Interval<T>& f )
+  {
+    return Interval<T>(select(s,t.lower,f.lower),
+                       select(s,t.upper,f.upper));
+  }
+
+  /*! same as above, with a condition of type T::Bool */
+  template<typename T> __forceinline Interval<T> select ( const typename T::Bool& s, const Interval<T>& t, const Interval<T>& f )
+  {
+    return Interval<T>(select(s,t.lower,f.lower),
+                       select(s,t.upper,f.upper));
+  }
+
+  /*! Returns 1 if the two intervals indicate a possible sign change, else 0.
+      With a tolerance of 1E-4, an interval "may be negative" if its lower bound
+      is below +eps and "may be positive" if its upper bound is above -eps. */
+  __forceinline int numRoots(const Interval<float>& p0, const Interval<float>& p1)
+  {
+    const float tolerance = 1E-4f;
+
+    const bool p0MayBeNegative = p0.lower <  tolerance;
+    const bool p0MayBePositive = p0.upper > -tolerance;
+    const bool p1MayBeNegative = p1.lower <  tolerance;
+    const bool p1MayBePositive = p1.upper > -tolerance;
+
+    if (p0MayBeNegative && p1MayBePositive) return 1;
+    if (p0MayBePositive && p1MayBeNegative) return 1;
+    if (p0MayBeNegative && p0MayBePositive) return 1;
+    if (p1MayBeNegative && p1MayBePositive) return 1;
+    return 0;
+  }
+  
+  /*! float interval in one, two and three dimensions */
+  using Interval1f = Interval<float>;
+  using Interval2f = Vec2<Interval<float>>;
+  using Interval3f = Vec3<Interval<float>>;
+
+/*! exchanges two floats */
+inline void swap(float& a, float& b)
+{
+  const float held = a;
+  a = b;
+  b = held;
+}
+
+/*! returns v with both bounds moved by the same offset */
+inline Interval1f shift(const Interval1f& v, float shift)
+{
+  const float movedLower = v.lower + shift;
+  const float movedUpper = v.upper + shift;
+  return Interval1f(movedLower, movedUpper);
+}
+
+#define TWO_PI (2.0*M_PI)
+/*! Conservative bounds of sin over an interval of angles (radians).
+    Intervals at least pi wide return [-1,1]. Otherwise the interval is moved
+    by multiples of 2*pi so that lower lies in [0,2*pi); since the width is
+    below pi, upper then lies below 3*pi. The bounds start from the sine at
+    both end points and are widened to +1/-1 when an extremum is strictly inside. */
+inline Interval1f sin(Interval1f interval)
+{
+  if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
+  if (interval.upper > TWO_PI)                 { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
+  if (interval.lower < 0)                      { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
+  float sinLower = sin(interval.lower);
+  float sinUpper = sin(interval.upper);
+  if (sinLower > sinUpper) swap(sinLower, sinUpper);
+  if (interval.lower <       M_PI / 2.0 && interval.upper >       M_PI / 2.0) sinUpper =  1.0;
+  if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0;
+  // an interval that wraps past 2*pi (upper in [2*pi,3*pi)) can contain the
+  // next maximum at 5*pi/2, e.g. [1.9*pi, 2.6*pi]
+  if (interval.lower < 5.0 * M_PI / 2.0 && interval.upper > 5.0 * M_PI / 2.0) sinUpper =  1.0;
+  return Interval1f(sinLower, sinUpper);
+}
+
+/*! Conservative bounds of cos over an interval of angles (radians).
+    Intervals at least pi wide return [-1,1]. Otherwise the interval is moved
+    by multiples of 2*pi so that lower lies in [0,2*pi); since the width is
+    below pi, upper then lies below 3*pi. The bounds start from the cosine at
+    both end points and are widened to -1/+1 when an extremum is strictly inside. */
+inline Interval1f cos(Interval1f interval)
+{
+  if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
+  if (interval.upper > TWO_PI)                 { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
+  if (interval.lower < 0)                      { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
+  float cosLower = cos(interval.lower);
+  float cosUpper = cos(interval.upper);
+  if (cosLower > cosUpper) swap(cosLower, cosUpper);
+  if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0;
+  // an interval that wraps past 2*pi (upper in [2*pi,3*pi)) contains the
+  // maximum at 2*pi, e.g. [1.9*pi, 2.1*pi]
+  if (interval.lower < TWO_PI && interval.upper > TWO_PI) cosUpper = 1.0;
+  return Interval1f(cosLower, cosUpper);
+}
+#undef TWO_PI
+}
--- a/thirdparty/embree/common/math/lbbox.h
+++ b/thirdparty/embree/common/math/lbbox.h
@@ -0,0 +1,331 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bbox.h"
+#include "range.h"
+
+namespace embree
+{
+  /*! Re-parameterizes a linear function given by the pair (v.first,v.second): the pair
+   *  is interpolated/extrapolated at the parameters -dt.lower/dt.size() and
+   *  (1-dt.lower)/dt.size(), i.e. at global times 0 and 1 when v describes the
+   *  function over the time range dt. */
+  template<typename T>
+    __forceinline std::pair<T,T> globalLinear(const std::pair<T,T>& v, const BBox1f& dt)
+  {
+    const float inv_extent = float(1.0f)/dt.size();
+    const float t_at_zero  = -dt.lower*inv_extent;
+    const float t_at_one   = (1.0f-dt.lower)*inv_extent;
+    const T at_time0 = lerp(v.first,v.second,t_at_zero);
+    const T at_time1 = lerp(v.first,v.second,t_at_one);
+    return std::make_pair(at_time0,at_time1);
+  }
+
+  template<typename T>
+  struct LBBox
+  {
+  public:
+    __forceinline LBBox () {} // default constructor: assigns nothing to bounds0/bounds1
+
+    template<typename T1>
+    __forceinline LBBox ( const LBBox<T1>& other ) // converting copy from an LBBox over another element type
+    : bounds0(other.bounds0), bounds1(other.bounds1) {} 
+
+    __forceinline LBBox& operator= ( const LBBox& other ) { // copy assignment of both end boxes
+      bounds0 = other.bounds0; bounds1 = other.bounds1; return *this; 
+    }
+
+    __forceinline LBBox (EmptyTy) // both end boxes are constructed from EmptyTy
+      : bounds0(EmptyTy()), bounds1(EmptyTy()) {}
+    
+    __forceinline explicit LBBox ( const BBox<T>& bounds) // the same box at both ends, i.e. bounds that do not change
+      : bounds0(bounds), bounds1(bounds) { }
+    
+    __forceinline LBBox ( const BBox<T>& bounds0, const BBox<T>& bounds1) // explicit boxes for the two ends
+      : bounds0(bounds0), bounds1(bounds1) { }
+
+    /*! Builds linear bounds enclosing a sequence of boxes, where box i of n is taken at
+     *  parameter i/(n-1). Starting from the first and the last box, both end boxes are
+     *  grown by the same amount wherever an intermediate box sticks out of the current
+     *  interpolation. The sequence must not be empty. */
+    LBBox ( const avector<BBox<T>>& bounds ) 
+    {
+      assert(bounds.size());
+      BBox<T> startBox = bounds.front();
+      BBox<T> endBox = bounds.back();
+      for (size_t k=1; k<bounds.size()-1; k++) {
+        const float t = float(k)/float(bounds.size()-1);
+        const BBox<T> blended = lerp(startBox,endBox,t);
+        const T below = min(bounds[k].lower-blended.lower,T(zero));
+        const T above = max(bounds[k].upper-blended.upper,T(zero));
+        startBox.lower += below; endBox.lower += below;
+        startBox.upper += above; endBox.upper += above;
+      }
+      bounds0 = startBox;
+      bounds1 = endBox;
+    }
+
+    /*! calculates the linear bounds of a primitive for the specified time range; time_range is scaled by numTimeSegments and bounds(i) is queried at the integer time steps covering it */
+    template<typename BoundsFunc>
+    __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments)
+    {
+      const float lower = time_range.lower*numTimeSegments; // range start in units of time segments
+      const float upper = time_range.upper*numTimeSegments; // range end in units of time segments
+      const float ilowerf = floor(lower); // time step at or before the range start
+      const float iupperf = ceil(upper); // time step at or after the range end
+      const int ilower = (int)ilowerf;
+      const int iupper = (int)iupperf;
+
+      const BBox<T> blower0 = bounds(ilower);
+      const BBox<T> bupper1 = bounds(iupper);
+
+      if (iupper-ilower == 1) { // the range lies within one segment: interpolate inside that segment only
+        bounds0 = lerp(blower0, bupper1, lower-ilowerf);
+        bounds1 = lerp(bupper1, blower0, iupperf-upper); // parameter measured backwards from the upper step
+        return;
+      }
+
+      const BBox<T> blower1 = bounds(ilower+1);
+      const BBox<T> bupper0 = bounds(iupper-1);
+      BBox<T> b0 = lerp(blower0, blower1, lower-ilowerf); // box at the start of the range
+      BBox<T> b1 = lerp(bupper1, bupper0, iupperf-upper); // box at the end of the range
+
+      for (int i = ilower+1; i < iupper; i++) // grow b0/b1 until every interior time step is enclosed
+      {
+        const float f = (float(i)/numTimeSegments - time_range.lower) / time_range.size(); // position of step i inside time_range
+        const BBox<T> bt = lerp(b0, b1, f); // what the current linear bounds give at step i
+        const BBox<T> bi = bounds(i);
+        const T dlower = min(bi.lower-bt.lower, T(zero)); // <= 0: how far bi sticks out below bt
+        const T dupper = max(bi.upper-bt.upper, T(zero)); // >= 0: how far bi sticks out above bt
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+    /*! calculates the linear bounds of a primitive for the specified global time range time_range_in, where the primitive itself is defined over geom_time_range with geom_time_segments segments */
+    template<typename BoundsFunc>
+    __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments)
+    {
+      /* normalize global time_range_in to local geom_time_range */
+      const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(),
+                              (time_range_in.upper-geom_time_range.lower)/geom_time_range.size());
+        
+      const float lower = time_range.lower*geom_time_segments; // range start in units of time segments
+      const float upper = time_range.upper*geom_time_segments; // range end in units of time segments
+      const float ilowerf = floor(lower);
+      const float iupperf = ceil(upper);
+      const float ilowerfc = max(0.0f,ilowerf); // lower step, not below 0
+      const float iupperfc = min(iupperf,geom_time_segments); // upper step, not above geom_time_segments
+      const int   ilowerc = (int)ilowerfc;
+      const int   iupperc = (int)iupperfc;
+      assert(iupperc-ilowerc > 0);
+
+      /* this larger iteration range guarantees that we process the borders of geom_time_range if it is (partially) inside time_range_in */
+      const int ilower_iter = max(-1,(int)ilowerf);
+      const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1);
+        
+      const BBox<T> blower0 = bounds(ilowerc);
+      const BBox<T> bupper1 = bounds(iupperc);
+      if (iupper_iter-ilower_iter == 1) { // single segment: interpolate inside that segment only
+        bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc)); // parameters are kept non-negative
+        bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper));
+        return;
+      }
+
+      const BBox<T> blower1 = bounds(ilowerc+1);
+      const BBox<T> bupper0 = bounds(iupperc-1);
+      BBox<T> b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc)); // box at the start of the range
+      BBox<T> b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper)); // box at the end of the range
+
+      for (int i = ilower_iter+1; i < iupper_iter; i++) // grow b0/b1 until every visited time step is enclosed
+      {
+        const float f = (float(i)/geom_time_segments - time_range.lower) / time_range.size(); // position of step i inside time_range
+        const BBox<T> bt = lerp(b0, b1, f); // what the current linear bounds give at step i
+        const BBox<T> bi = bounds(i);
+        const T dlower = min(bi.lower-bt.lower, T(zero)); // <= 0: how far bi sticks out below bt
+        const T dupper = max(bi.upper-bt.upper, T(zero)); // >= 0: how far bi sticks out above bt
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+    /*! Calculates linear bounds over the integer time steps from time_range.begin() to
+     *  time_range.end(). The boxes at the first and last step are the starting point;
+     *  both are then grown by equal amounts until their interpolation encloses
+     *  bounds(i) at every step in between. numTimeSegments is not read by this overload. */
+    template<typename BoundsFunc>
+    __forceinline LBBox(const BoundsFunc& bounds, const range<int>& time_range, int numTimeSegments)
+    {
+      const int firstStep = time_range.begin();
+      const int lastStep  = time_range.end();
+
+      BBox<T> startBox = bounds(firstStep);
+      BBox<T> endBox   = bounds(lastStep);
+
+      /* with a single segment there is no interior step and the loop body never runs */
+      for (int step = firstStep+1; step < lastStep; step++)
+      {
+        const float t = float(step - time_range.begin()) / float(time_range.size());
+        const BBox<T> expected = lerp(startBox, endBox, t);
+        const BBox<T> actual = bounds(step);
+        const T below = min(actual.lower-expected.lower, T(zero));
+        const T above = max(actual.upper-expected.upper, T(zero));
+        startBox.lower += below; endBox.lower += below;
+        startBox.upper += above; endBox.upper += above;
+      }
+
+      bounds0 = startBox;
+      bounds1 = endBox;
+    }
+
+    /*! Calculates the linear bounds for target_time_range of a primitive whose linear
+     *  bounds lbounds are given over its own time range time_range_in.
+     *  target_time_range is first expressed relative to time_range_in and clipped to
+     *  [0,1]; the boxes at the clipped ends are then grown so that their interpolation
+     *  also encloses lbounds.bounds0 at time_range_in.lower and lbounds.bounds1 at
+     *  time_range_in.upper. Neither time range is checked for zero size. */
+    __forceinline LBBox(const BBox1f& time_range_in, const LBBox<T>& lbounds, const BBox1f& target_time_range)
+    {
+      /* local copies in the element type T of this LBBox (these names shadow the members,
+         hence this-> at the end); a fixed 3D float box type here would force a
+         conversion for every other T */
+      const BBox<T> bounds0 = lbounds.bounds0;
+      const BBox<T> bounds1 = lbounds.bounds1;
+      
+      /* normalize global target_time_range to local time_range_in */
+      const BBox1f time_range((target_time_range.lower-time_range_in.lower)/time_range_in.size(),
+                              (target_time_range.upper-time_range_in.lower)/time_range_in.size());
+
+      const BBox1f clipped_time_range(max(0.0f,time_range.lower), min(1.0f,time_range.upper));
+
+      /* compute bounds at begin and end of clipped time range */
+      BBox<T> b0 = lerp(bounds0,bounds1,clipped_time_range.lower);
+      BBox<T> b1 = lerp(bounds0,bounds1,clipped_time_range.upper);
+
+      /* make sure that b0 is properly bounded at time_range_in.lower */
+      {
+        const BBox<T> bt = lerp(b0, b1, (0.0f - time_range.lower) / time_range.size());
+        const T dlower = min(bounds0.lower-bt.lower, T(zero));
+        const T dupper = max(bounds0.upper-bt.upper, T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+
+      /* make sure that b1 is properly bounded at time_range_in.upper */
+      {
+        const BBox<T> bt = lerp(b0, b1, (1.0f - time_range.lower) / time_range.size());
+        const T dlower = min(bounds1.lower-bt.lower, T(zero));
+        const T dupper = max(bounds1.upper-bt.upper, T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+      
+      this->bounds0 = b0;
+      this->bounds1 = b1;
+    }
+
+    /*! calculates the linear bounds for target_time_range of a primitive with its time_range_in and its two end boxes; forwards to the overload taking an LBBox */
+    __forceinline LBBox(const BBox1f& time_range_in, const BBox<T>& bounds0, const BBox<T>& bounds1, const BBox1f& target_time_range)
+      : LBBox(time_range_in,LBBox(bounds0,bounds1),target_time_range) {}
+
+  public:
+
+    __forceinline bool empty() const { // true when the merge of both end boxes is empty
+      return bounds().empty();
+    }
+
+    __forceinline BBox<T> bounds () const { // merge of the two end boxes
+      return merge(bounds0,bounds1);
+    }
+
+    __forceinline BBox<T> interpolate( const float t ) const { // box obtained by lerp between bounds0 and bounds1 at parameter t
+      return lerp(bounds0,bounds1,t);
+    }
+
+    __forceinline LBBox<T> interpolate( const BBox1f& dt ) const { // linear bounds whose ends are the boxes at dt.lower and dt.upper
+      return LBBox<T>(interpolate(dt.lower),interpolate(dt.upper));
+    }
+
+    __forceinline void extend( const LBBox& other ) { // extends each end box by the matching end box of other
+      bounds0.extend(other.bounds0);
+      bounds1.extend(other.bounds1);
+    }
+
+    __forceinline float expectedHalfArea() const; // declaration only; a specialization for Vec3fa is defined after the class
+
+    __forceinline float expectedHalfArea(const BBox1f& dt) const { // expectedHalfArea() of interpolate(dt)
+      return interpolate(dt).expectedHalfArea();
+    }
+
+    __forceinline float expectedApproxHalfArea() const { // mean of halfArea of the two end boxes
+      return 0.5f*(halfArea(bounds0) + halfArea(bounds1));
+    }
+
+    /* Treats these linear bounds as valid over the time range dt and returns the
+       bounds obtained by evaluating them at the parameters that correspond to
+       global times 0 and 1. */
+    __forceinline LBBox global(const BBox1f& dt) const 
+    {
+      const float inv_extent = 1.0f/dt.size();
+      const float t_at_zero  = -dt.lower*inv_extent;
+      const float t_at_one   = (1.0f-dt.lower)*inv_extent;
+      const BBox<T> at_time0 = interpolate(t_at_zero);
+      const BBox<T> at_time1 = interpolate(t_at_one);
+      return LBBox(at_time0,at_time1);
+    }
+
+    /*! Comparison Operators: two LBBoxes are equal when both of their end boxes are equal */
+    //template<typename TT> friend __forceinline bool operator==( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
+    //template<typename TT> friend __forceinline bool operator!=( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
+    friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
+    friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
+    
+    /*! output operator: writes "LBBox { <bounds0>; <bounds1> }" */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) {
+      return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }";
+    }
+
+  public:
+    BBox<T> bounds0, bounds1; // the boxes at the two ends of the linear interpolation
+  };
+
+  /*! tests if box is finite: isvalid must hold for both end boxes */
+  template<typename T>
+    __forceinline bool isvalid( const LBBox<T>& v ) {
+    return isvalid(v.bounds0) && isvalid(v.bounds1);
+  }
+
+  template<typename T> // same test using isvalid_non_empty on both end boxes
+    __forceinline bool isvalid_non_empty( const LBBox<T>& v ) {
+    return isvalid_non_empty(v.bounds0) && isvalid_non_empty(v.bounds1);
+  }
+  
+  /*! Integral over t in [0,1] of the product of two linear functions, the first going
+   *  from a0 to a1 and the second from b0 to b1:
+   *  a0*b0 + (a0*db + da*b0)/2 + da*db/3 with da = a1-a0 and db = b1-b0. */
+  template<typename T>
+    __forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1)
+  {
+    const T deltaA = a1-a0;
+    const T deltaB = b1-b0;
+    return a0*b0+(a0*deltaB+deltaA*b0)*T(0.5f) + deltaA*deltaB*T(1.0f/3.0f);
+  }
+  
+  /*! Vec3fa specialization: applies expectedArea to the three extent pairs (x,y),
+   *  (y,z) and (z,x) of the two end boxes and sums the three results. */
+  template<> __forceinline float LBBox<Vec3fa>::expectedHalfArea() const 
+  {
+    const Vec3fa extent0 = bounds0.size();
+    const Vec3fa extent1 = bounds1.size();
+    const Vec3fa first0 (extent0.x,extent0.y,extent0.z);
+    const Vec3fa first1 (extent1.x,extent1.y,extent1.z);
+    const Vec3fa second0(extent0.y,extent0.z,extent0.x);
+    const Vec3fa second1(extent1.y,extent1.z,extent1.x);
+    return reduce_add(expectedArea(first0,first1,second0,second1));
+  }
+
+  template<typename T> // free-function form of LBBox::expectedApproxHalfArea()
+  __forceinline float expectedApproxHalfArea(const LBBox<T>& box) {
+    return box.expectedApproxHalfArea(); 
+  }
+
+  template<typename T> // merges the end boxes of a and b pairwise
+  __forceinline LBBox<T> merge(const LBBox<T>& a, const LBBox<T>& b) {
+    return LBBox<T>(merge(a.bounds0, b.bounds0), merge(a.bounds1, b.bounds1));
+  }
+
+   /*! subset relation: holds when it holds for both pairs of end boxes */
+  template<typename T> __inline bool subset( const LBBox<T>& a, const LBBox<T>& b ) {
+    return subset(a.bounds0,b.bounds0) && subset(a.bounds1,b.bounds1);
+  }
+
+  /*! default template instantiations: shorthand names for LBBox over the element types used below */
+  typedef LBBox<float> LBBox1f;
+  typedef LBBox<Vec2f> LBBox2f;
+  typedef LBBox<Vec3f> LBBox3f;
+  typedef LBBox<Vec3fa> LBBox3fa;
+  typedef LBBox<Vec3fx> LBBox3fx;
+}
--- a/thirdparty/embree/common/math/linearspace2.h
+++ b/thirdparty/embree/common/math/linearspace2.h
@@ -0,0 +1,149 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 2D Linear Transform (2x2 Matrix)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct LinearSpace2
+  {
+    typedef T Vector;
+    typedef typename T::Scalar Scalar;
+
+    /*! default matrix constructor: assigns nothing to vx/vy */
+    __forceinline LinearSpace2           ( ) {}
+
+    __forceinline LinearSpace2           ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; } // copy: assigns both columns
+    __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; }
+
+    template<typename L1> __forceinline LinearSpace2( const LinearSpace2<L1>& s ) : vx(s.vx), vy(s.vy) {} // conversion from a matrix over another vector type
+
+    /*! matrix construction from column vectors */
+    __forceinline LinearSpace2(const Vector& vx, const Vector& vy)
+      : vx(vx), vy(vy) {}
+
+    /*! matrix construction from row major data (mRC = row R, column C) */
+    __forceinline LinearSpace2(const Scalar& m00, const Scalar& m01, 
+                               const Scalar& m10, const Scalar& m11)
+      : vx(m00,m10), vy(m01,m11) {}
+
+    /*! compute the determinant of the matrix */
+    __forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; }
+
+    /*! compute adjoint matrix; for rows (a b; c d) this is (d -b; -c a) */
+    __forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); }
+
+    /*! compute inverse matrix as adjoint()/det(); det() is not checked against zero here */
+    __forceinline const LinearSpace2 inverse() const { return adjoint()/det(); }
+
+    /*! compute transposed matrix (the columns are passed as rows to the row major constructor) */
+    __forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); }
+
+    /*! returns first row of matrix */
+    __forceinline Vector row0() const { return Vector(vx.x,vy.x); }
+
+    /*! returns second row of matrix */
+    __forceinline Vector row1() const { return Vector(vx.y,vy.y); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {} // both columns constructed from zero
+    __forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {} // identity matrix
+
+    /*! return matrix for scaling: s.x and s.y on the diagonal */
+    static __forceinline LinearSpace2 scale(const Vector& s) {
+      return LinearSpace2(s.x,   0,
+                          0  , s.y);
+    }
+
+    /*! return matrix for rotation by the angle r, rows (cos -sin; sin cos) */
+    static __forceinline LinearSpace2 rotate(const Scalar& r) {
+      Scalar s = sin(r), c = cos(r);
+      return LinearSpace2(c, -s,
+                          s,  c);
+    }
+
+    /*! return closest orthogonal matrix (i.e. a general rotation including reflection);
+     *  a mirroring of the first column is split off first, the remainder is iterated
+     *  with m <- 0.5*(m + inverse(transposed(m))) for at most 99 steps */
+    LinearSpace2 orthogonal() const 
+    {
+      LinearSpace2 current = *this;
+
+      // negative determinant: flip the first column and remember the sign
+      Scalar sign_x(one);
+      if (current.det() < Scalar(zero)) {
+        current.vx = -current.vx;
+        sign_x = -sign_x;
+      }
+
+      // averaging iteration, left early once the squared column change drops below 1e-8
+      int step = 0;
+      while (step < 99) {
+        const LinearSpace2 averaged = 0.5 * (current + current.transposed().inverse());
+        const LinearSpace2 change = averaged - current;
+        current = averaged;
+        if (max(dot(change.vx, change.vx), dot(change.vy, change.vy)) < 1e-8)
+          break;
+        step++;
+      }
+
+      // re-apply the sign of the first column to the result
+      return LinearSpace2(sign_x*current.vx, current.vy);
+    }
+
+  public:
+
+    /*! the column vectors of the matrix */
+    Vector vx,vy;
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a ) { return LinearSpace2<T>(-a.vx,-a.vy); } // negates both columns
+  template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a ) { return LinearSpace2<T>(+a.vx,+a.vy); } // unary plus on both columns
+  template<typename T> __forceinline LinearSpace2<T> rcp       ( const LinearSpace2<T>& a ) { return a.inverse(); } // reciprocal of a matrix is its inverse
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx+b.vx,a.vy+b.vy); } // column-wise sum
+  template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx-b.vx,a.vy-b.vy); } // column-wise difference
+
+  template<typename T> __forceinline LinearSpace2<T> operator*(const typename T::Scalar & a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); } // scalar * matrix
+  template<typename T> __forceinline T               operator*(const LinearSpace2<T>& a, const T              & b) { return b.x*a.vx + b.y*a.vy; } // matrix * vector: combination of the columns
+  template<typename T> __forceinline LinearSpace2<T> operator*(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); } // matrix * matrix: a applied to each column of b
+
+  template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const typename T::Scalar & b) { return LinearSpace2<T>(a.vx/b, a.vy/b); } // matrix / scalar
+  template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return a * rcp(b); } // a times the inverse of b
+
+  template<typename T> __forceinline LinearSpace2<T>& operator *=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a * b; }
+  template<typename T> __forceinline LinearSpace2<T>& operator /=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx == b.vx && a.vy == b.vy; } // equal when both columns are equal
+  template<typename T> __forceinline bool operator !=( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx != b.vx || a.vy != b.vy; } // differs when any column differs
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace2<T>& m) { // prints both column vectors
+    return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}";
+  }
+
+  /*! Shortcuts for common linear spaces. */
+  typedef LinearSpace2<Vec2f> LinearSpace2f;
+  typedef LinearSpace2<Vec2fa> LinearSpace2fa;
+}
--- a/thirdparty/embree/common/math/linearspace3.h
+++ b/thirdparty/embree/common/math/linearspace3.h
@@ -0,0 +1,217 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec3.h"
+#include "quaternion.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 3D Linear Transform (3x3 Matrix)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct LinearSpace3
+  {
+    typedef T Vector;
+    typedef typename T::Scalar Scalar;
+
+    /*! default matrix constructor: assigns nothing to vx/vy/vz */
+    __forceinline LinearSpace3           ( ) {}
+
+    __forceinline LinearSpace3           ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; } // copy: assigns all three columns
+    __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; }
+
+    template<typename L1> __forceinline LinearSpace3( const LinearSpace3<L1>& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {} // conversion from a matrix over another vector type
+
+    /*! matrix construction from column vectors */
+    __forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz)
+      : vx(vx), vy(vy), vz(vz) {}
+
+    /*! construction from quaternion (components r,i,j,k); q is not normalized here */
+    __forceinline LinearSpace3( const QuaternionT<Scalar>& q )
+      : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j))
+      , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i))
+      , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {}
+
+    /*! matrix construction from row major data (mRC = row R, column C) */
+    __forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02,
+                               const Scalar& m10, const Scalar& m11, const Scalar& m12,
+                               const Scalar& m20, const Scalar& m21, const Scalar& m22)
+      : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {}
+
+    /*! compute the determinant of the matrix as the triple product of its columns */
+    __forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); }
+
+    /*! compute adjoint matrix: the pairwise column cross products, transposed */
+    __forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); }
+
+    /*! compute inverse matrix as adjoint()/det(); det() is not checked against zero here */
+    __forceinline const LinearSpace3 inverse() const { return adjoint()/det(); }
+
+    /*! compute transposed matrix (the columns are passed as rows to the row major constructor) */
+    __forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); }
+
+    /*! returns first row of matrix */
+    __forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); }
+
+    /*! returns second row of matrix */
+    __forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); }
+
+    /*! returns third row of matrix */
+    __forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {} // all columns constructed from zero
+    __forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {} // identity matrix
+
+    /*! return matrix for scaling: s.x, s.y and s.z on the diagonal */
+    static __forceinline LinearSpace3 scale(const Vector& s) {
+      return LinearSpace3(s.x,   0,   0,
+                          0  , s.y,   0,
+                          0  ,   0, s.z);
+    }
+
+    /*! return matrix for rotation around arbitrary axis _u by the angle r; the axis is normalized first */
+    static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) {
+      Vector u = normalize(_u);
+      Scalar s = sin(r), c = cos(r);
+      return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c,  u.x*u.y*(1-c)-u.z*s,    u.x*u.z*(1-c)+u.y*s,
+                          u.x*u.y*(1-c)+u.z*s,    u.y*u.y+(1-u.y*u.y)*c,  u.y*u.z*(1-c)-u.x*s,
+                          u.x*u.z*(1-c)-u.y*s,    u.y*u.z*(1-c)+u.x*s,    u.z*u.z+(1-u.z*u.z)*c);
+    }
+
+  public:
+
+    /*! the column vectors of the matrix */
+    Vector vx,vy,vz;
+  };
+
+#if !defined(__SYCL_DEVICE_ONLY__)
+  
+  /*! compute transposed matrix: Vec3fa specialization that reinterprets the columns as vfloat4 and calls transpose() with a zero fourth input */
+  template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const { 
+    vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz);
+    return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); 
+  }
+#endif
+  
+  template<typename T> // free-function form of LinearSpace3::transposed()
+    __forceinline const LinearSpace3<T> transposed(const LinearSpace3<T>& xfm) { 
+    return xfm.transposed();
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a ) { return LinearSpace3<T>(-a.vx,-a.vy,-a.vz); } // negates all columns
+  template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a ) { return LinearSpace3<T>(+a.vx,+a.vy,+a.vz); } // unary plus on all columns
+  template<typename T> __forceinline LinearSpace3<T> rcp       ( const LinearSpace3<T>& a ) { return a.inverse(); } // reciprocal of a matrix is its inverse
+
+  /* constructs a coordinate frame from a normalized normal; N becomes the third column */
+  template<typename T> __forceinline LinearSpace3<T> frame(const T& N) 
+  {
+    const T dx0(0,N.z,-N.y); // first candidate, has zero dot product with N
+    const T dx1(-N.z,0,N.x); // second candidate, has zero dot product with N
+    const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1)); // keep the longer candidate
+    const T dy = normalize(cross(N,dx));
+    return LinearSpace3<T>(dx,dy,N);
+  }
+
+  /* constructs a coordinate frame from a normal and approximate x-direction */
+  template<typename T> __forceinline LinearSpace3<T> frame(const T& N, const T& dxi)
+  {
+    if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are very parallel
+    const T dx = normalize(cross(dxi,N));
+    const T dy = normalize(cross(N,dx));
+    return LinearSpace3<T>(dx,dy,N);
+  }
+  
+  /* clamps linear space to range -1 to +1 by clamping each column vector */
+  template<typename T> __forceinline LinearSpace3<T> clamp(const LinearSpace3<T>& space) {
+    return LinearSpace3<T>(clamp(space.vx,T(-1.0f),T(1.0f)),
+                           clamp(space.vy,T(-1.0f),T(1.0f)),
+                           clamp(space.vz,T(-1.0f),T(1.0f)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); } // column-wise sum
+  template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); } // column-wise difference
+
+  template<typename T> __forceinline LinearSpace3<T> operator*(const typename T::Scalar & a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } // scalar * matrix
+  template<typename T> __forceinline T               operator*(const LinearSpace3<T>& a, const T              & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); } // matrix * vector: combination of the columns
+  template<typename T> __forceinline LinearSpace3<T> operator*(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } // matrix * matrix: a applied to each column of b
+
+  template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const typename T::Scalar & b) { return LinearSpace3<T>(a.vx/b, a.vy/b, a.vz/b); } // matrix / scalar
+  template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return a * rcp(b); } // a times the inverse of b
+
+  template<typename T> __forceinline LinearSpace3<T>& operator *=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a * b; }
+  template<typename T> __forceinline LinearSpace3<T>& operator /=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a / b; }
+
+  template<typename T> __forceinline T       xfmPoint (const LinearSpace3<T>& s, const T      & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } // same expression as matrix * vector
+  template<typename T> __forceinline T       xfmVector(const LinearSpace3<T>& s, const T      & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } // same expression as xfmPoint
+  template<typename T> __forceinline T       xfmNormal(const LinearSpace3<T>& s, const T      & a) { return xfmVector(s.inverse().transposed(),a); } // uses the inverse transpose of s
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } // equal when all columns are equal
+  template<typename T> __forceinline bool operator !=( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } // differs when any column differs
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace3<T> select ( const typename T::Scalar::Bool& s, const LinearSpace3<T>& t, const LinearSpace3<T>& f ) { // column-wise select between t and f
+    return LinearSpace3<T>(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz));
+  }
+
+  /*! blending: column-wise lerp between l0 and l1 with a float parameter */
+  template<typename T>
+    __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, const LinearSpace3<T>& l1, const float t) 
+  {
+    return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
+                           lerp(l0.vy,l1.vy,t),
+                           lerp(l0.vz,l1.vz,t));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace3<T>& m) { // prints the three column vectors
+    return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}";
+  }
+
+  /*! Shortcuts for common linear spaces. */
+  typedef LinearSpace3<Vec3f> LinearSpace3f;
+  typedef LinearSpace3<Vec3fa> LinearSpace3fa;
+  typedef LinearSpace3<Vec3fx> LinearSpace3fx;
+  typedef LinearSpace3<Vec3ff> LinearSpace3ff;
+
+  template<int N> using LinearSpace3vf = LinearSpace3<Vec3<vfloat<N>>>; // linear space over vfloat<N> components
+  typedef LinearSpace3<Vec3<vfloat<4>>>  LinearSpace3vf4;
+  typedef LinearSpace3<Vec3<vfloat<8>>>  LinearSpace3vf8;
+  typedef LinearSpace3<Vec3<vfloat<16>>> LinearSpace3vf16;
+
+  /*! blending: column-wise lerp between l0 and l1 with a parameter of arbitrary type S */
+  template<typename T, typename S>
+    __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0,
+                                       const LinearSpace3<T>& l1,
+                                       const S& t)
+  {
+    return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
+                           lerp(l0.vy,l1.vy,t),
+                           lerp(l0.vz,l1.vz,t));
+  }
+
+}
--- a/thirdparty/embree/common/math/math_sycl.h
+++ b/thirdparty/embree/common/math/math_sycl.h
@@ -0,0 +1,279 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "constants.h"
+#include <cmath>
+
+namespace embree
+{
+  __forceinline bool isvalid ( const float& v ) { // true only when v lies strictly between -FLT_LARGE and +FLT_LARGE
+    return (v > -FLT_LARGE) & (v < +FLT_LARGE); // bitwise & evaluates both comparisons; a NaN fails both and yields false
+  }
+
+  __forceinline int cast_f2i(float f) { // reinterprets the bit pattern of f as an int (no numeric conversion)
+    return __builtin_bit_cast(int,f);
+  }
+
+  __forceinline float cast_i2f(int i) { // reinterprets the bit pattern of i as a float
+    return __builtin_bit_cast(float,i);
+  }
+
+  __forceinline int   toInt  (const float& a) { return int(a); } // value conversion through an int(...) cast
+  __forceinline float toFloat(const int&   a) { return float(a); } // value conversion through a float(...) cast
+
+  __forceinline float asFloat(const int   a) { return __builtin_bit_cast(float,a); } // bit-level cast, same operation as cast_i2f
+  __forceinline int   asInt  (const float a) { return __builtin_bit_cast(int,a); } // bit-level cast, same operation as cast_f2i
+  
+  //__forceinline bool finite ( const float x ) { return _finite(x) != 0; }
+  __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; } // -1 for x < 0, otherwise +1 (so 0 maps to +1)
+  __forceinline float sqr  ( const float x ) { return x*x; } // x squared
+
+  __forceinline float rcp  ( const float x ) { // reciprocal of x via sycl::native::recip
+    return sycl::native::recip(x); // NOTE(review): native:: presumably trades accuracy for speed - confirm against the SYCL spec
+  }
+
+  __forceinline float signmsk(const float a) { return asFloat(asInt(a) & 0x80000000); } // keeps only bit 31 (the sign bit) of a's bit pattern
+  //__forceinline float signmsk ( const float x ) {
+  //  return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+  //}
+  //__forceinline float xorf( const float x, const float y ) {
+  //  return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
+  //}
+  //__forceinline float andf( const float x, const unsigned y ) {
+  //  return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
+  //}
+  
+  __forceinline float rsqrt( const float x ) { // reciprocal square root, forwarded to sycl::rsqrt
+    return sycl::rsqrt(x);
+  }
+
+  //__forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); }
+  //__forceinline double nextafter(double x, double y) { return _nextafter(x, y); }
+  //__forceinline int roundf(float f) { return (int)(f + 0.5f); }
+
+  __forceinline float abs  ( const float x ) { return sycl::fabs(x); } // scalar float math: every function in this group forwards to a sycl:: built-in (abs uses fabs)
+  __forceinline float acos ( const float x ) { return sycl::acos(x); }
+  __forceinline float asin ( const float x ) { return sycl::asin(x); }
+  __forceinline float atan ( const float x ) { return sycl::atan(x); }
+  __forceinline float atan2( const float y, const float x ) { return sycl::atan2(y, x); } // note the (y, x) argument order
+  __forceinline float cos  ( const float x ) { return sycl::cos(x); }
+  __forceinline float cosh ( const float x ) { return sycl::cosh(x); }
+  __forceinline float exp  ( const float x ) { return sycl::exp(x); }
+  __forceinline float fmod ( const float x, const float y ) { return sycl::fmod(x, y); }
+  __forceinline float log  ( const float x ) { return sycl::log(x); }
+  __forceinline float log10( const float x ) { return sycl::log10(x); }
+  __forceinline float pow  ( const float x, const float y ) { return sycl::pow(x, y); }
+  __forceinline float sin  ( const float x ) { return sycl::sin(x); }
+  __forceinline float sinh ( const float x ) { return sycl::sinh(x); }
+  __forceinline float sqrt ( const float x ) { return sycl::sqrt(x); }
+  __forceinline float tan  ( const float x ) { return sycl::tan(x); }
+  __forceinline float tanh ( const float x ) { return sycl::tanh(x); }
+  __forceinline float floor( const float x ) { return sycl::floor(x); }
+  __forceinline float ceil ( const float x ) { return sycl::ceil(x); }
+  __forceinline float frac ( const float x ) { return x-floor(x); } // the only non-forwarding entry: x minus floor(x), built on the floor wrapper above
+
+  //__forceinline double abs  ( const double x ) { return ::fabs(x); }
+  //__forceinline double sign ( const double x ) { return x<0?-1.0:1.0; }
+  //__forceinline double acos ( const double x ) { return ::acos (x); }
+  //__forceinline double asin ( const double x ) { return ::asin (x); }
+  //__forceinline double atan ( const double x ) { return ::atan (x); }
+  //__forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); }
+  //__forceinline double cos  ( const double x ) { return ::cos  (x); }
+  //__forceinline double cosh ( const double x ) { return ::cosh (x); }
+  //__forceinline double exp  ( const double x ) { return ::exp  (x); }
+  //__forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); }
+  //__forceinline double log  ( const double x ) { return ::log  (x); }
+  //__forceinline double log10( const double x ) { return ::log10(x); }
+  //__forceinline double pow  ( const double x, const double y ) { return ::pow  (x, y); }
+  //__forceinline double rcp  ( const double x ) { return 1.0/x; }
+  //__forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); }
+  //__forceinline double sin  ( const double x ) { return ::sin  (x); }
+  //__forceinline double sinh ( const double x ) { return ::sinh (x); }
+  //__forceinline double sqr  ( const double x ) { return x*x; }
+  //__forceinline double sqrt ( const double x ) { return ::sqrt (x); }
+  //__forceinline double tan  ( const double x ) { return ::tan  (x); }
+  //__forceinline double tanh ( const double x ) { return ::tanh (x); }
+  //__forceinline double floor( const double x ) { return ::floor (x); }
+  //__forceinline double ceil ( const double x ) { return ::ceil (x); }
+
+/*
+#if defined(__SSE4_1__)
+  __forceinline float mini(float a, float b) {
+    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+    const __m128i ci = _mm_min_epi32(ai,bi);
+    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+  }
+#endif
+
+#if defined(__SSE4_1__)
+  __forceinline float maxi(float a, float b) {
+    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+    const __m128i ci = _mm_max_epi32(ai,bi);
+    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+  }
+#endif
+*/
+  
+  template<typename T>
+    __forceinline T twice(const T& a) { return a+a; } // doubles a using addition only (T needs operator+)
+
+  __forceinline      int min(int      a, int      b) { return sycl::min(a,b); } // two-argument minimum: integer overloads forward to sycl::min
+  __forceinline unsigned min(unsigned a, unsigned b) { return sycl::min(a,b); }
+  __forceinline  int64_t min(int64_t  a, int64_t  b) { return sycl::min(a,b); }
+  __forceinline    float min(float    a, float    b) { return sycl::fmin(a,b); } // floating-point overloads forward to sycl::fmin instead
+  __forceinline   double min(double   a, double   b) { return sycl::fmin(a,b); }
+#if defined(__X86_64__)
+  __forceinline   size_t min(size_t   a, size_t   b) { return sycl::min(a,b); } // size_t overload is only provided when __X86_64__ is defined
+#endif
+
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } // 3, 4 and 5 argument forms reduce through the two-argument min
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); }
+
+//  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); }
+//  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); }
+//  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); }
+
+  __forceinline      int max(int      a, int      b) { return sycl::max(a,b); } // two-argument maximum: integer overloads forward to sycl::max
+  __forceinline unsigned max(unsigned a, unsigned b) { return sycl::max(a,b); }
+  __forceinline  int64_t max(int64_t  a, int64_t  b) { return sycl::max(a,b); }
+  __forceinline    float max(float    a, float    b) { return sycl::fmax(a,b); } // floating-point overloads forward to sycl::fmax instead
+  __forceinline   double max(double   a, double   b) { return sycl::fmax(a,b); }
+#if defined(__X86_64__)
+  __forceinline   size_t max(size_t   a, size_t   b) { return sycl::max(a,b); } // size_t overload is only provided when __X86_64__ is defined
+#endif
+
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } // 3, 4 and 5 argument forms reduce through the two-argument max
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); }
+
+//  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); }
+//  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); }
+//  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); }
+
+  template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); } // limits x to [lower, upper], default [T(zero), T(one)]; lower wins if lower > upper
+  template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); } // clamp with the lower bound fixed at T(zero)
+
+  template<typename T> __forceinline T  deg2rad ( const T& x )  { return x * T(1.74532925199432957692e-2f); } // multiplies by pi/180
+  template<typename T> __forceinline T  rad2deg ( const T& x )  { return x * T(5.72957795130823208768e1f); } // multiplies by 180/pi
+  template<typename T> __forceinline T  sin2cos ( const T& x )  { return sqrt(max(T(zero),T(one)-x*x)); } // sqrt(1 - x*x) with the radicand clamped at zero; always the non-negative root
+  template<typename T> __forceinline T  cos2sin ( const T& x )  { return sin2cos(x); } // same formula, so it simply reuses sin2cos
+
+  __forceinline float madd  ( const float a, const float b, const float c) { return +sycl::fma(+a,b,+c); } //  a*b + c, all four variants go through sycl::fma
+  __forceinline float msub  ( const float a, const float b, const float c) { return +sycl::fma(+a,b,-c); } //  a*b - c
+  __forceinline float nmadd ( const float a, const float b, const float c) { return +sycl::fma(-a,b,+c); } // -a*b + c
+  __forceinline float nmsub ( const float a, const float b, const float c) { return -sycl::fma(+a,b,+c); } // -(a*b + c), i.e. -a*b - c
+
+  /*! random functions */
+/*
+  template<typename T> T random() { return T(0); }
+  template<> __forceinline int      random() { return int(rand()); }
+  template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); }
+  template<> __forceinline float  random() { return rand()/float(RAND_MAX); }
+  template<> __forceinline double random() { return rand()/double(RAND_MAX); }
+*/
+  
+  /*! selects: return t when s is true and f otherwise (scalar bool, int and float overloads) */
+  __forceinline bool  select(bool s, bool  t , bool f) { return s ? t : f; }
+  __forceinline int   select(bool s, int   t,   int f) { return s ? t : f; }
+  __forceinline float select(bool s, float t, float f) { return s ? t : f; }
+
+  __forceinline bool none(bool s) { return !s; } // mask reductions for a single bool: none is the negation,
+  __forceinline bool all (bool s) { return s; } // while all and any both return s unchanged
+  __forceinline bool any (bool s) { return s; }
+
+  __forceinline unsigned movemask (bool s) { return (unsigned)s; } // converts the bool to unsigned: 1 when set, 0 otherwise
+
+  __forceinline float lerp(const float v0, const float v1, const float t) { // linear interpolation between v0 and v1 with weight t
+    return madd(1.0f-t,v0,t*v1); // (1-t)*v0 + t*v1
+  }
+
+  template<typename T>
+    __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) { // bilinear blend of four scalars with weights u and v
+    return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3))); // (1-u)*((1-v)*x0 + v*x2) + u*((1-v)*x1 + v*x3)
+  }
+
+  /*! exchange: swaps the values of a and b through one temporary copy */
+  template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; }
+
+   /*! masked scalar load/store; within this header mem<Ty> is specialized for float only */
+  template<typename Ty> struct mem;
+  template<> struct mem<float> {
+    /* masked loads: read *ptr when mask is set, otherwise return 0.0f without dereferencing ptr.
+     * load and loadu are identical here. static_cast keeps the const of the incoming pointer. */
+    static __forceinline float load (bool mask, const void* ptr) { return mask ? *static_cast<const float*>(ptr) : 0.0f; }
+    static __forceinline float loadu(bool mask, const void* ptr) { return mask ? *static_cast<const float*>(ptr) : 0.0f; }
+    static __forceinline void store (bool mask, void* ptr, const float v) { if (mask) *static_cast<float*>(ptr) = v; } // masked stores: write v only when mask is set
+    static __forceinline void storeu(bool mask, void* ptr, const float v) { if (mask) *static_cast<float*>(ptr) = v; } // identical to store here
+  };
+  
+  /*! bit reverse operation: reverses bit order by swapping progressively larger groups (masks and final shift assume a 32-bit T - TODO confirm for other widths or signed T) */
+  template<class T>
+    __forceinline T bitReverse(const T& vin)
+  {
+    T v = vin;
+    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); // swap neighbouring bits
+    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); // swap neighbouring 2-bit pairs
+    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); // swap neighbouring nibbles
+    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); // swap neighbouring bytes
+    v = ( v >> 16             ) | ( v               << 16); // swap the two 16-bit halves (unmasked)
+    return v;
+  }
+
+  /*! bit interleave operation: spreads the low 10 bits of each input so that
+   *  every bit is followed by two zero bits, then merges the three results with
+   *  x in bit 3k, y in bit 3k+1 and z in bit 3k+2 */
+  template<class T>
+    __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin)
+  {
+    /* inserts two zero bits after every one of the low 10 input bits */
+    const auto spread = [] (const T& in) -> T {
+      T v = in;
+      v = (v | (v << 16)) & 0x030000FF;
+      v = (v | (v <<  8)) & 0x0300F00F;
+      v = (v | (v <<  4)) & 0x030C30C3;
+      v = (v | (v <<  2)) & 0x09249249;
+      return v;
+    };
+
+    const T x = spread(xin);
+    const T y = spread(yin);
+    const T z = spread(zin);
+
+    return x | (y << 1) | (z << 2);
+  }
+
+  /*! bit interleave operation for 64bit data types: takes the low 21 bits of
+   *  each input, spreads them so that every bit is followed by two zero bits,
+   *  and merges the three results into one value with x in bit 3k,
+   *  y in bit 3k+1 and z in bit 3k+2 */
+  template<class T>
+    __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin)
+  {
+    /* spreads one coordinate over every third bit of the result */
+    const auto spread = [] (const T& in) -> T {
+      /* only the low 21 bits of the input take part */
+      T v = in & 0x1fffff;
+      /* each step halves the size of the bit groups and widens the gaps */
+      v = (v | (v << 32)) & 0x1f00000000ffff;
+      v = (v | (v << 16)) & 0x1f0000ff0000ff;
+      v = (v | (v <<  8)) & 0x100f00f00f00f00f;
+      v = (v | (v <<  4)) & 0x10c30c30c30c30c3;
+      v = (v | (v <<  2)) & 0x1249249249249249;
+      return v;
+    };
+
+    const T x = spread(xin);
+    const T y = spread(yin);
+    const T z = spread(zin);
+
+    /* shift y and z into the gaps left free by x */
+    return x | (y << 1) | (z << 2);
+  }
+}
--- a/thirdparty/embree/common/math/obbox.h
+++ b/thirdparty/embree/common/math/obbox.h
@@ -0,0 +1,39 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bbox.h"
+#include "linearspace3.h"
+
+namespace embree
+{
+  /*! Oriented bounding box: a BBox<T> (bounds) paired with the LinearSpace3<T> (space) it is expressed in */
+  template<typename T>
+    struct OBBox 
+  {
+  public:
+    
+    __forceinline OBBox () {} // initializes neither member explicitly
+    
+    __forceinline OBBox (EmptyTy) // empty box: space built from `one`, bounds built from `empty`
+      : space(one), bounds(empty) {}
+    
+    __forceinline OBBox (const BBox<T>& bounds) // wraps existing bounds, space built from `one` (presumably the identity - confirm in LinearSpace3)
+      : space(one), bounds(bounds) {}
+      
+    __forceinline OBBox (const LinearSpace3<T>& space, const BBox<T>& bounds) // stores both arguments as given
+      : space(space), bounds(bounds) {}
+    
+    friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) { // streams space followed by bounds
+      return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}";
+    }
+    
+  public:
+    LinearSpace3<T> space; //!< orthonormal transformation
+    BBox<T> bounds;        //!< bounds in transformed space
+  };
+
+  typedef OBBox<Vec3f> OBBox3f; // oriented box over Vec3f
+  typedef OBBox<Vec3fa> OBBox3fa; // oriented box over Vec3fa
+}
--- a/thirdparty/embree/common/math/quaternion.h
+++ b/thirdparty/embree/common/math/quaternion.h
@@ -0,0 +1,258 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec3.h"
+#include "vec4.h"
+
+#include "transcendental.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////
+  // Quaternion Struct
+  ////////////////////////////////////////////////////////////////
+
+  template<typename T>
+  struct QuaternionT // quaternion with real part r and imaginary parts i, j, k, all of component type T
+  {
+    typedef Vec3<T> Vector;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline QuaternionT           ()                     { } // leaves r, i, j, k default-initialized
+    __forceinline QuaternionT           ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; }
+    __forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; }
+
+    __forceinline          QuaternionT( const T& r       ) : r(r), i(zero), j(zero), k(zero) {} // real quaternion; not explicit, so a T converts implicitly
+    __forceinline explicit QuaternionT( const Vec3<T>& v ) : r(zero), i(v.x), j(v.y), k(v.z) {} // pure imaginary quaternion (0, v)
+    __forceinline explicit QuaternionT( const Vec4<T>& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {} // component order is (r, i, j, k) = (x, y, z, w)
+    __forceinline          QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {}
+    __forceinline          QuaternionT( const T& r, const Vec3<T>& v ) : r(r), i(v.x), j(v.y), k(v.z) {} // real part r, imaginary part v
+
+    __inline QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz ); // from three vectors of a matrix; defined after the struct
+    __inline QuaternionT( const T& yaw, const T& pitch, const T& roll ); // from three angles; defined after the struct
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {} // all components zero
+    __forceinline QuaternionT( OneTy  ) : r( one), i(zero), j(zero), k(zero) {} // real part one, imaginary part zero
+
+    /*! return quaternion for rotation by angle r around axis u: (cos(r/2), sin(r/2)*normalize(u)); u need not be normalized */
+    static __forceinline QuaternionT rotate(const Vec3<T>& u, const T& r) {
+      return QuaternionT<T>(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u));
+    }
+
+    /*! returns the imaginary (vector) part (i, j, k) of the quaternion; it is not normalized */
+    __forceinline Vec3<T> v( ) const { return Vec3<T>(i, j, k); }
+
+  public:
+    T r, i, j, k;
+  };
+
+  template<typename T> __forceinline QuaternionT<T> operator *( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a * b.r, a * b.i, a * b.j, a * b.k); } // scalar * quaternion: scales all four components
+  template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r * b, a.i * b, a.j * b, a.k * b); } // quaternion * scalar: scales all four components
+
+  ////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a ) { return QuaternionT<T>(+a.r, +a.i, +a.j, +a.k); } // unary plus on every component
+  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a ) { return QuaternionT<T>(-a.r, -a.i, -a.j, -a.k); } // negates every component
+  template<typename T> __forceinline QuaternionT<T> conj      ( const QuaternionT<T>& a ) { return QuaternionT<T>(a.r, -a.i, -a.j, -a.k); } // conjugate: negates i, j, k and keeps r
+  template<typename T> __forceinline T              abs       ( const QuaternionT<T>& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } // Euclidean norm of the four components
+  template<typename T> __forceinline QuaternionT<T> rcp       ( const QuaternionT<T>& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } // inverse: conjugate divided by the squared norm
+  template<typename T> __forceinline QuaternionT<T> normalize ( const QuaternionT<T>& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } // scales a by rsqrt of its squared norm; no guard for a zero quaternion
+
+  // evaluates a*q-p component-wise (a is a scalar, q and p are quaternions)
+  template<typename T> __forceinline QuaternionT<T>
+  msub(const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
+  {
+    return QuaternionT<T>(msub(a, q.r, p.r),
+                          msub(a, q.i, p.i),
+                          msub(a, q.j, p.j),
+                          msub(a, q.k, p.k));
+  }
+  // evaluates a*q+p component-wise (a is a scalar, q and p are quaternions)
+  template<typename T> __forceinline QuaternionT<T>
+  madd (const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
+  {
+    return QuaternionT<T>(madd(a, q.r, p.r),
+                          madd(a, q.i, p.i),
+                          madd(a, q.j, p.j),
+                          madd(a, q.k, p.k));
+  }
+
+  ////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline QuaternionT<T> operator +( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a + b.r,  b.i,  b.j,  b.k); } // a scalar operand only affects the real part
+  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r + b, a.i, a.j, a.k); }
+  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); } // component-wise sum
+  template<typename T> __forceinline QuaternionT<T> operator -( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a - b.r, -b.i, -b.j, -b.k); } // scalar minus quaternion: imaginary part of b is negated
+  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r - b, a.i, a.j, a.k); }
+  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); } // component-wise difference
+
+  template<typename T> __forceinline Vec3<T>       operator *( const QuaternionT<T>& a, const Vec3<T>      & b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } // vector part of a * (0, b) * conj(a)
+  template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const QuaternionT<T>& b ) { // quaternion (Hamilton) product; not commutative
+    return QuaternionT<T>(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k,
+                          a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j,
+                          a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i,
+                          a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r);
+  }
+  template<typename T> __forceinline QuaternionT<T> operator /( const T             & a, const QuaternionT<T>& b ) { return a*rcp(b); } // scalar times the quaternion inverse of b
+  template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const T             & b ) { return a*rcp(b); } // quaternion times the scalar reciprocal of b
+  template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a*rcp(b); } // right-multiplies a by the inverse of b
+
+  template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const T             & b ) { return a = a+b; } // compound assignments: each one applies the matching binary operator and stores the result in a
+  template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a+b; }
+  template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const T             & b ) { return a = a-b; }
+  template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a-b; }
+  template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const T             & b ) { return a = a*b; }
+  template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*b; } // right-multiplies: a becomes a*b
+  template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const T             & b ) { return a = a*rcp(b); } // division is written as multiplication by rcp(b)
+  template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*rcp(b); }
+
+  template<typename T, typename M> __forceinline QuaternionT<T> // component-wise select: each of r, i, j, k comes from q or p according to the mask m
+  select(const M& m, const QuaternionT<T>& q, const QuaternionT<T>& p)
+  {
+    return QuaternionT<T>(select(m, q.r, p.r),
+                          select(m, q.i, p.i),
+                          select(m, q.j, p.j),
+                          select(m, q.k, p.k));
+  }
+
+
+  template<typename T> __forceinline Vec3<T> xfmPoint ( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } // all three xfm* functions compute the same thing: the vector part of a * (0, b) * conj(a)
+  template<typename T> __forceinline Vec3<T> xfmVector( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+  template<typename T> __forceinline Vec3<T> xfmNormal( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+
+  template<typename T> __forceinline T dot(const QuaternionT<T>& a, const QuaternionT<T>& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; } // four-component dot product
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; } // exact equality of all four components
+  template<typename T> __forceinline bool operator !=( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; } // true when any component differs
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Orientation Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> QuaternionT<T>::QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz ) // builds the quaternion from three matrix vectors (NOTE(review): presumably the columns of an orthonormal rotation matrix - confirm)
+  {
+    if ( vx.x + vy.y + vz.z >= T(zero) ) // case 1: the diagonal sum (trace) is non-negative
+    {
+      const T t = T(one) + (vx.x + vy.y + vz.z);
+      const T s = rsqrt(t)*T(0.5f);
+      r = t*s; // equals 0.5*sqrt(t)
+      i = (vy.z - vz.y)*s;
+      j = (vz.x - vx.z)*s;
+      k = (vx.y - vy.x)*s;
+    }
+    else if ( vx.x >= max(vy.y, vz.z) ) // case 2: vx.x is the largest diagonal entry, so i gets the t*s term
+    {
+      const T t = (T(one) + vx.x) - (vy.y + vz.z);
+      const T s = rsqrt(t)*T(0.5f);
+      r = (vy.z - vz.y)*s;
+      i = t*s;
+      j = (vx.y + vy.x)*s;
+      k = (vz.x + vx.z)*s;
+    }
+    else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) )
+    {
+      const T t = (T(one) + vy.y) - (vz.z + vx.x); // case 3: vy.y is the largest diagonal entry, so j gets the t*s term
+      const T s = rsqrt(t)*T(0.5f);
+      r = (vz.x - vx.z)*s;
+      i = (vx.y + vy.x)*s;
+      j = t*s;
+      k = (vy.z + vz.y)*s;
+    }
+    else //if ( vz.z >= max(vy.y, vx.x) )
+    {
+      const T t = (T(one) + vz.z) - (vx.x + vy.y); // case 4: vz.z is the largest diagonal entry, so k gets the t*s term
+      const T s = rsqrt(t)*T(0.5f);
+      r = (vx.y - vy.x)*s;
+      i = (vz.x + vx.z)*s;
+      j = (vy.z + vz.y)*s;
+      k = t*s;
+    }
+  }
+
+  template<typename T> QuaternionT<T>::QuaternionT( const T& yaw, const T& pitch, const T& roll ) // builds the quaternion from three angles
+  {
+    const T cya = cos(yaw  *T(0.5f)); // cosines of the three half angles
+    const T cpi = cos(pitch*T(0.5f));
+    const T cro = cos(roll *T(0.5f));
+    const T sya = sin(yaw  *T(0.5f)); // sines of the three half angles
+    const T spi = sin(pitch*T(0.5f));
+    const T sro = sin(roll *T(0.5f));
+    r = cro*cya*cpi + sro*sya*spi; // NOTE(review): the axis assignment and rotation order behind these sign patterns
+    i = cro*cya*spi + sro*sya*cpi; // are not stated in this file - confirm against the callers before relying on them
+    j = cro*sya*cpi - sro*cya*spi;
+    k = sro*cya*cpi - cro*sya*spi;
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  //////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> static embree_ostream operator<<(embree_ostream cout, const QuaternionT<T>& q) { // streams the four components in r, i, j, k order and returns the stream
+    return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }";
+  }
+
+  /*! default template instantiations: scalar float/double quaternions, then variants with vfloat<N> components */
+  typedef QuaternionT<float>  Quaternion3f;
+  typedef QuaternionT<double> Quaternion3d;
+
+  template<int N> using Quaternion3vf = QuaternionT<vfloat<N>>; // generic alias; the typedefs below name N = 4, 8 and 16
+  typedef QuaternionT<vfloat<4>>  Quaternion3vf4;
+  typedef QuaternionT<vfloat<8>>  Quaternion3vf8;
+  typedef QuaternionT<vfloat<16>> Quaternion3vf16;
+
+  //////////////////////////////////////////////////////////////////////////////
+  /// Interpolation
+  //////////////////////////////////////////////////////////////////////////////
+  /*! component-wise linear interpolation of r, i, j, k; the result is not renormalized */
+  template<typename T>
+  __forceinline QuaternionT<T>lerp(const QuaternionT<T>& q0,
+                                   const QuaternionT<T>& q1,
+                                   const T& factor)
+  {
+    const T r = lerp(q0.r, q1.r, factor);
+    const T i = lerp(q0.i, q1.i, factor);
+    const T j = lerp(q0.j, q1.j, factor);
+    const T k = lerp(q0.k, q1.k, factor);
+    return QuaternionT<T>(r, i, j, k);
+  }
+
+  template<typename T>
+  __forceinline QuaternionT<T> slerp(const QuaternionT<T>& q0, // interpolates from q0 towards q1_ with weight t
+                                     const QuaternionT<T>& q1_,
+                                     const T& t)
+  {
+    T cosTheta = dot(q0, q1_);
+    QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_); // negate q1_ when the dot product is negative ...
+    cosTheta          = select(cosTheta < 0.f, -cosTheta, cosTheta); // ... so that cosTheta is non-negative from here on
+
+    // spherical linear interpolation: cos(phi)*q0 - sin(phi)*normalize(cosTheta*q0 - q1) with phi = t*acos(cosTheta)
+    const T phi = t * fastapprox::acos(cosTheta);
+    T sinPhi, cosPhi;
+    fastapprox::sincos(phi, sinPhi, cosPhi);
+    QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1));
+    QuaternionT<T> qslerp = msub(cosPhi, q0, qperp);
+
+    // regular linear interpolation as fallback (normalized lerp, always computed alongside the slerp result)
+    QuaternionT<T> qlerp = normalize(lerp(q0, q1, t));
+
+    return select(cosTheta > 0.9995f, qlerp, qslerp); // the fallback is chosen when cosTheta exceeds 0.9995
+  }
+}
--- a/thirdparty/embree/common/math/range.h
+++ b/thirdparty/embree/common/math/range.h
@@ -0,0 +1,137 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../math/emath.h"
+
+namespace embree
+{
+  template<typename Ty>
+    struct range
+    {
+      /*! default constructor: _begin and _end are left uninitialized */
+      __forceinline range() {}
+
+      /*! range holding the single element begin, i.e. [begin, begin+1) */
+      __forceinline range(const Ty& begin)
+        : _begin(begin), _end(begin+1) {}
+
+      __forceinline range(const Ty& begin, const Ty& end)
+        : _begin(begin), _end(end) {}
+
+      __forceinline range(const range& other)
+        : _begin(other._begin), _end(other._end) {}
+
+      /*! converting copy: both bounds are cast to Ty */
+      template<typename T1>
+      __forceinline range(const range<T1>& other)
+        : _begin(Ty(other._begin)), _end(Ty(other._end)) {}
+
+      /*! converting assignment: both bounds are assigned with implicit conversion */
+      template<typename T1>
+      __forceinline range& operator =(const range<T1>& other) {
+        _begin = other._begin;
+        _end = other._end;
+        return *this;
+      }
+
+      /*! accessors for the two bounds */
+      __forceinline Ty begin() const { return _begin; }
+      __forceinline Ty end  () const { return _end; }
+
+      /*! overlap of this range and r; the result is empty() when the two do not overlap */
+      __forceinline range intersect(const range& r) const {
+        return range (max(_begin,r._begin),min(_end,r._end));
+      }
+
+      /*! number of elements, computed as _end - _begin */
+      __forceinline Ty size() const { return _end - _begin; }
+
+      /*! true when the range holds no element (_end <= _begin) */
+      __forceinline bool empty() const { return _end <= _begin; }
+
+      /*! midpoint (_begin + _end)/2, using the division of Ty */
+      __forceinline Ty center() const { return (_begin + _end)/2; }
+
+      /*! splits at center() and returns the pair (left half, right half) */
+      __forceinline std::pair<range,range> split() const
+      {
+        const Ty mid = center();
+        return std::make_pair(range(_begin,mid),range(mid,_end));
+      }
+
+      /*! splits at center() and writes the two halves to left_o and right_o */
+      __forceinline void split(range& left_o, range& right_o) const
+      {
+        const Ty mid = center();
+        left_o = range(_begin,mid);
+        right_o = range(mid,_end);
+      }
+
+      /*! orders ranges by their size(), not by their position */
+      __forceinline friend bool operator< (const range& r0, const range& r1) { return r0.size() < r1.size(); }
+
+      /*! prints the range as "range [begin, end]" */
+      friend embree_ostream operator<<(embree_ostream cout, const range& r) {
+        return cout << "range [" << r.begin() << ", " << r.end() << "]";
+      }
+
+      Ty _begin, _end; //!< first element and one-past-the-last element
+    };
+
+  template<typename Ty>
+    range<Ty> make_range(const Ty& begin, const Ty& end) { // helper that deduces Ty from its arguments
+    return range<Ty>(begin,end);
+  }
+
+  template<typename Ty>
+    struct extended_range : public range<Ty>
+    {
+      /*! default constructor: no bound is initialized */
+      __forceinline extended_range () {}
+
+      /*! the next two constructors create an empty extension (_ext_end equals _end) */
+      __forceinline extended_range (const Ty& begin)
+        : range<Ty>(begin), _ext_end(begin+1) {}
+
+      __forceinline extended_range (const Ty& begin, const Ty& end)
+        : range<Ty>(begin,end), _ext_end(end) {}
+
+      /*! range [begin, end) with an explicitly given extended end */
+      __forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end)
+        : range<Ty>(begin,end), _ext_end(ext_end) {}
+
+      /*! the extended end, the size from _begin to it, and the size of the extension after _end */
+      __forceinline Ty ext_end       () const { return _ext_end; }
+      __forceinline Ty ext_size      () const { return _ext_end - this->_begin; }
+      __forceinline Ty ext_range_size() const { return _ext_end - this->_end; }
+
+      /*! true when the extension after _end is non-empty */
+      __forceinline bool has_ext_range() const {
+        assert(_ext_end >= this->_end);
+        return (_ext_end - this->_end) > 0;
+      }
+
+      /*! replaces the extended end; asserts that it does not lie before _end */
+      __forceinline void set_ext_range(const size_t ext_end){
+        assert(ext_end >= this->_end);
+        _ext_end = ext_end;
+      }
+
+      /*! shifts _begin, _end and _ext_end to the right by plus */
+      __forceinline void move_right(const size_t plus){
+        this->_begin += plus;
+        this->_end   += plus;
+        _ext_end     += plus;
+      }
+
+      /*! prints the range as "extended_range [begin, end (ext_end)]" */
+      friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) {
+        return cout << "extended_range [" << r.begin() << ", " << r.end() <<  " (" << r.ext_end() << ")]";
+      }
+
+      Ty _ext_end; //!< end of the extension that follows the base range
+    };
+}
--- a/thirdparty/embree/common/math/transcendental.h
+++ b/thirdparty/embree/common/math/transcendental.h
@@ -0,0 +1,525 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+// Transcendental functions from "ispc": https://github.com/ispc/ispc/
+// Most of the transcendental implementations in ispc code come from
+// Solomon Boulos's "syrah": https://github.com/boulos/syrah/
+
+#include "../simd/simd.h"
+
+namespace embree
+{
+
+namespace fastapprox
+{
+
+// Approximate sin(v): v is reduced by multiples of Pi/2 and a sin- or cos-type polynomial is chosen per quadrant.
+template <typename T>
+__forceinline T sin(const T &v)
+{
+  const float piOverTwoVec = 1.57079637050628662109375; // plain const locals, as in cos()/sincos()/tan()
+  const float twoOverPiVec = 0.636619746685028076171875;
+  auto scaled = v * twoOverPiVec;
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * piOverTwoVec;
+  auto kMod4 = k & 3;                           // quadrant index of v
+  auto sinUseCos = (kMod4 == 1) | (kMod4 == 3); // odd quadrants evaluate the cos polynomial
+  auto flipSign = (kMod4 > 1);                  // quadrants 2 and 3 negate the result
+
+  // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2, 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
+  const float sinC2  = -0.16666667163372039794921875;
+  const float sinC4  = +8.333347737789154052734375e-3;
+  const float sinC6  = -1.9842604524455964565277099609375e-4;
+  const float sinC8  = +2.760012648650445044040679931640625e-6;
+  const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+  const float cosC2  = -0.5;
+  const float cosC4  = +4.166664183139801025390625e-2;
+  const float cosC6  = -1.388833043165504932403564453125e-3;
+  const float cosC8  = +2.47562347794882953166961669921875e-5;
+  const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+  // The sin polynomial approximates sin(x)/x and is multiplied by x afterwards; the cos one is multiplied by 1.
+  auto outside = select(sinUseCos, 1., x);
+  auto c2  = select(sinUseCos, T(cosC2),  T(sinC2));
+  auto c4  = select(sinUseCos, T(cosC4),  T(sinC4));
+  auto c6  = select(sinUseCos, T(cosC6),  T(sinC6));
+  auto c8  = select(sinUseCos, T(cosC8),  T(sinC8));
+  auto c10 = select(sinUseCos, T(cosC10), T(sinC10));
+
+  auto x2 = x * x;
+  auto formula = x2 * c10 + c8;
+  formula = x2 * formula + c6;
+  formula = x2 * formula + c4;
+  formula = x2 * formula + c2;
+  formula = x2 * formula + 1.;
+  formula *= outside;
+
+  return select(flipSign, -formula, formula);
+}
+
+// Approximate cos(v): Pi/2 range reduction, then a per-quadrant choice between the cos and sin polynomials.
+template <typename T>
+__forceinline T cos(const T &v)
+{
+  const float halfPi    = 1.57079637050628662109375;
+  const float invHalfPi = 0.636619746685028076171875;
+
+  // Split v into a whole number of quarter periods and a remainder.
+  auto quarterTurns = floor(v * invHalfPi);
+  auto remainder = v - quarterTurns * halfPi;
+  auto quadrant = toInt(quarterTurns) & 3;
+
+  // Even quadrants evaluate the cos polynomial; quadrants 1 and 2 negate the result.
+  auto evenQuadrant = (quadrant == 0) | (quadrant == 2);
+  auto negate = (quadrant == 1) | (quadrant == 2);
+
+  // Coefficients of the sin-type polynomial (its result is multiplied by the remainder below) ...
+  const float sinC2  = -0.16666667163372039794921875;
+  const float sinC4  = +8.333347737789154052734375e-3;
+  const float sinC6  = -1.9842604524455964565277099609375e-4;
+  const float sinC8  = +2.760012648650445044040679931640625e-6;
+  const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+  // ... and of the cos-type polynomial (its result is multiplied by 1).
+  const float cosC2  = -0.5;
+  const float cosC4  = +4.166664183139801025390625e-2;
+  const float cosC6  = -1.388833043165504932403564453125e-3;
+  const float cosC8  = +2.47562347794882953166961669921875e-5;
+  const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+  auto scale = select(evenQuadrant, 1., remainder);
+  auto k2  = select(evenQuadrant, T(cosC2),  T(sinC2));
+  auto k4  = select(evenQuadrant, T(cosC4),  T(sinC4));
+  auto k6  = select(evenQuadrant, T(cosC6),  T(sinC6));
+  auto k8  = select(evenQuadrant, T(cosC8),  T(sinC8));
+  auto k10 = select(evenQuadrant, T(cosC10), T(sinC10));
+
+  auto r2 = remainder * remainder;
+  auto poly = r2 * k10 + k8;
+  poly = r2 * poly + k6;
+  poly = r2 * poly + k4;
+  poly = r2 * poly + k2;
+  poly = r2 * poly + 1.;
+  poly *= scale;
+  return select(negate, -poly, poly);
+}
+
+// Computes approximations of sin(v) and cos(v) together from one shared range reduction; results go to the out-params.
+template <typename T>
+__forceinline void sincos(const T &v, T &sinResult, T &cosResult)
+{
+  const float halfPi    = 1.57079637050628662109375;
+  const float invHalfPi = 0.636619746685028076171875;
+
+  // Quarter-period count, remainder within the quarter period, and quadrant index.
+  auto quarterTurns = floor(v * invHalfPi);
+  auto remainder = v - quarterTurns * halfPi;
+  auto quadrant = toInt(quarterTurns) & 3;
+
+  // Which polynomial feeds which output, and which outputs get negated.
+  auto cosFromCosPoly = ((quadrant == 0) | (quadrant == 2));
+  auto sinFromCosPoly = ((quadrant == 1) | (quadrant == 3));
+  auto negateSin = (quadrant > 1);
+  auto negateCos = ((quadrant == 1) | (quadrant == 2));
+
+  const float unit   = +1.;
+  const float sinC2  = -0.16666667163372039794921875;
+  const float sinC4  = +8.333347737789154052734375e-3;
+  const float sinC6  = -1.9842604524455964565277099609375e-4;
+  const float sinC8  = +2.760012648650445044040679931640625e-6;
+  const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+  const float cosC2  = -0.5;
+  const float cosC4  = +4.166664183139801025390625e-2;
+  const float cosC6  = -1.388833043165504932403564453125e-3;
+  const float cosC8  = +2.47562347794882953166961669921875e-5;
+  const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+  auto r2 = remainder * remainder;
+
+  // Horner evaluation of the sin-type polynomial, followed by the multiplication with the remainder.
+  auto sinPoly = r2 * sinC10 + sinC8;
+  sinPoly = r2 * sinPoly + sinC6;
+  sinPoly = r2 * sinPoly + sinC4;
+  sinPoly = r2 * sinPoly + sinC2;
+  sinPoly = r2 * sinPoly + unit;
+  sinPoly *= remainder;
+
+  // Horner evaluation of the cos-type polynomial.
+  auto cosPoly = r2 * cosC10 + cosC8;
+  cosPoly = r2 * cosPoly + cosC6;
+  cosPoly = r2 * cosPoly + cosC4;
+  cosPoly = r2 * cosPoly + cosC2;
+  cosPoly = r2 * cosPoly + unit;
+
+  sinResult = select(sinFromCosPoly, cosPoly, sinPoly);
+  cosResult = select(cosFromCosPoly, cosPoly, sinPoly);
+
+  sinResult = select(negateSin, -sinResult, sinResult);
+  cosResult = select(negateCos, -cosResult, cosResult);
+}
+
+// Approximate tan(v): works on |v|, reduces by multiples of Pi/4, evaluates a tan or cot polynomial, restores the sign.
+template <typename T>
+__forceinline T tan(const T &v)
+{
+  const float piOverFourVec = 0.785398185253143310546875;
+  const float fourOverPiVec = 1.27323949337005615234375;
+
+  auto xLt0 = v < 0.;
+  auto y = select(xLt0, -v, v);
+  auto scaled = y * fourOverPiVec;
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  auto x = y - kReal * piOverFourVec;
+
+  // If k & 1, x -= Pi/4
+  auto needOffset = (k & 1) != 0;
+  x = select(needOffset, x - piOverFourVec, x);
+
+  // If (k & 3) is 0 or 3, z comes from the tan polynomial; otherwise z comes from the negated cot polynomial.
+  auto kMod4 = k & 3;
+  auto useCotan = (kMod4 == 1) | (kMod4 == 2);
+
+  const float oneVec = 1.0;
+
+  const float tanC2  = +0.33333075046539306640625;
+  const float tanC4  = +0.13339905440807342529296875;
+  const float tanC6  = +5.3348250687122344970703125e-2;
+  const float tanC8  = +2.46033705770969390869140625e-2;
+  const float tanC10 = +2.892402000725269317626953125e-3;
+  const float tanC12 = +9.500005282461643218994140625e-3;
+
+  const float cotC2  = -0.3333333432674407958984375;
+  const float cotC4  = -2.222204394638538360595703125e-2;
+  const float cotC6  = -2.11752182804048061370849609375e-3;
+  const float cotC8  = -2.0846328698098659515380859375e-4;
+  const float cotC10 = -2.548247357481159269809722900390625e-5;
+  const float cotC12 = -3.5257363606433500535786151885986328125e-7;
+
+  auto x2 = x * x;
+  T z = T(0.f); // defined start value: the select() in the tan branch reads z even when the cot branch was skipped
+  if (any(useCotan))
+  {
+    auto cotVal = x2 * cotC12 + cotC10;
+    cotVal = x2 * cotVal + cotC8;
+    cotVal = x2 * cotVal + cotC6;
+    cotVal = x2 * cotVal + cotC4;
+    cotVal = x2 * cotVal + cotC2;
+    cotVal = x2 * cotVal + oneVec;
+    // The equation is for x * cot(x) but we need -x * cot(x) for the tan part.
+    cotVal /= -x;
+    z = cotVal;
+  }
+  auto useTan = !useCotan;
+  if (any(useTan))
+  {
+    auto tanVal = x2 * tanC12 + tanC10;
+    tanVal = x2 * tanVal + tanC8;
+    tanVal = x2 * tanVal + tanC6;
+    tanVal = x2 * tanVal + tanC4;
+    tanVal = x2 * tanVal + tanC2;
+    tanVal = x2 * tanVal + oneVec;
+    // Equation was for tan(x)/x
+    tanVal *= x;
+    z = select(useTan, tanVal, z);
+  }
+  return select(xLt0, -z, z);
+}
+
+// Approximate asin(x0): polynomial in |x0| scaled by -sqrt(1-|x0|) and shifted; NaN where |x0| > 1.
+template <typename T>
+__forceinline T asin(const T &x0)
+{
+  auto negativeInput = (x0 < 0.f);
+  auto mag = abs(x0);
+  auto outOfDomain = (mag > 1.f);
+
+  // Coefficients from sollya:
+  // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|], [1e-20;.9999999999999999]);
+  // avg error: 1.1105439e-06, max error 1.3187528e-06
+  auto value = 1.928029954433441162109375e-2f + mag * (-4.3095736764371395111083984375e-3f);
+  value = -4.489909112453460693359375e-2f + mag * value;
+  value = 8.78556668758392333984375e-2f + mag * value;
+  value = -0.21450997889041900634765625f + mag * value;
+  value = 1.57079517841339111328125f + mag * value;
+
+  // Undo the normalisation used for the fit: multiply by -sqrt(1-x), then add the second constant.
+  value *= -sqrt(1.f - mag);
+  value = value + 1.57079637050628662109375f;
+
+  value = select(value < 0.f, T(0.f), value);                   // clamp negative results to zero
+  value = select(negativeInput, -value, value);                 // restore the sign of the input
+  value = select(outOfDomain, T(cast_i2f(0x7fc00000)), value);  // bit pattern 0x7fc00000 for |x0| > 1
+
+  return value;
+}
+
+// Approximate acos(v), computed as a fixed constant minus the asin() approximation of v.
+template <typename T> __forceinline T acos(const T &v) {
+  const T asinValue = asin(v);
+  return 1.57079637050628662109375f - asinValue;
+}
+
+// Approximate atan(v) using atan(-x) = -atan(x) and, for x > 1, atan(x) = Pi/2 - atan(1/x), then a polynomial.
+template <typename T>
+__forceinline T atan(const T &v)
+{
+  const float halfPi = 1.57079637050628662109375;
+
+  // Work on the magnitude; the sign is restored at the end.
+  auto negativeInput = v < 0.f;
+  auto magnitude = select(negativeInput, -v, v);
+
+  // Magnitudes above 1 are replaced by rcpSafe(magnitude).
+  auto aboveOne = magnitude > 1.;
+  auto t = select(aboveOne, rcpSafe(magnitude), magnitude);
+
+  // Coefficients of the polynomial approximating atan(x)/x
+  const float atanC0  = +0.99999988079071044921875;
+  const float atanC2  = -0.3333191573619842529296875;
+  const float atanC4  = +0.199689209461212158203125;
+  const float atanC6  = -0.14015688002109527587890625;
+  const float atanC8  = +9.905083477497100830078125e-2;
+  const float atanC10 = -5.93664981424808502197265625e-2;
+  const float atanC12 = +2.417283318936824798583984375e-2;
+  const float atanC14 = -4.6721356920897960662841796875e-3;
+
+  auto t2 = t * t;
+  auto approx = t2 * atanC14 + atanC12;
+  approx = t2 * approx + atanC10;
+  approx = t2 * approx + atanC8;
+  approx = t2 * approx + atanC6;
+  approx = t2 * approx + atanC4;
+  approx = t2 * approx + atanC2;
+  approx = t2 * approx + atanC0;
+  approx *= t;
+  approx = select(aboveOne, halfPi - approx, approx);
+  return select(negativeInput, -approx, approx);
+}
+
+// Approximate atan2(y, x): atan(y * rcpSafe(x)) plus an offset of +Pi or -Pi when x is negative.
+template <typename T>
+__forceinline T atan2(const T &y, const T &x)
+{
+  const float piVec = 3.1415926536;
+
+  // Reference behaviour of atan2(y, x):
+  //   y > 0,   x = +-0  ->  Pi/2
+  //   y < 0,   x = +-0  -> -Pi/2
+  //   y = +-0, x < +0   -> +-Pi
+  //   y = +-0, x >= +0  -> +-0
+  //   y >= 0,  x < 0    ->  Pi + atan(y/x)
+  //   y <  0,  x < 0    -> -Pi + atan(y/x)
+  //   x > 0             ->  atan(y/x)
+  // and then a bunch of code for dealing with infinities.
+  auto xNegative = x < 0.f;
+  auto yNegative = y < 0.f;
+  auto quotientAtan = atan(y * rcpSafe(x));
+
+  // Only negative x gets an offset; the sign of the offset follows the sign of y.
+  auto signedPi = select(yNegative, T(-piVec), T(piVec));
+  auto offset = select(xNegative, signedPi, 0.f);
+  return offset + quotientAtan;
+}
+
+// Approximate exp(v): writes v as k*ln(2) + x, evaluates a polynomial in x and scales by 2^k built from float bits.
+template <typename T>
+__forceinline T exp(const T &v)
+{
+  const float ln2Part1 = 0.6931457519;     // ln(2) is supplied in two pieces (their sum is ~0.6931471805)
+  const float ln2Part2 = 1.4286067653e-6;  // second, small piece; subtracted in a separate step below
+  const float oneOverLn2 = 1.44269502162933349609375;
+
+  auto scaled = v * oneOverLn2;
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * ln2Part1;
+  x -= kReal * ln2Part2;
+
+  // These coefficients are for e^x in [0, ln(2)]
+  const float one = 1.;
+  const float c2 = 0.4999999105930328369140625;
+  const float c3 = 0.166668415069580078125;
+  const float c4 = 4.16539050638675689697265625e-2;
+  const float c5 = 8.378830738365650177001953125e-3;
+  const float c6 = 1.304379315115511417388916015625e-3;
+  const float c7 = 2.7555381529964506626129150390625e-4;
+
+  auto result = x * c7 + c6; // Horner evaluation, highest coefficient first
+  result = x * result + c5;
+  result = x * result + c4;
+  result = x * result + c3;
+  result = x * result + c2;
+  result = x * result + one; // coefficient of x^1
+  result = x * result + one; // constant term
+
+  // Compute 2^k (should differ for float and double, but I'll avoid it for now and just do floats)
+  const int fpbias = 127;
+  auto biasedN = k + fpbias;
+  auto overflow = kReal > fpbias;
+  // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
+  // we've got underflow. -127 * ln(2) -> -88.02. So the most
+  // negative float input that doesn't result in zero is like -88.
+  auto underflow = kReal <= -fpbias;
+  const int infBits = 0x7f800000; // bit pattern used as the overflow result
+  biasedN <<= 23; // move the biased exponent above the 23 low bits
+  // Reinterpret this thing as float
+  auto twoToTheN = asFloat(biasedN);
+  // Handle both doubles and floats (hopefully eliding the copy for float)
+  auto elemtype2n = twoToTheN;
+  result *= elemtype2n;
+  result = select(overflow, cast_i2f(infBits), result); // overflow lanes take the infBits pattern
+  result = select(underflow, 0., result);               // underflow lanes become zero
+  return result;
+}
+
+// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n * log(2) + log(y), y being the reduced range (usually in [1/2, 1)).
+// Outputs: `reduced` receives the input bits with the exponent field replaced by 126; `exponent` receives (bits >> 23) + 1 - 127.
+template <typename T, typename R>
+__forceinline void __rangeReduceLog(const T &input,
+                                    T &reduced,
+                                    R &exponent)
+{
+  auto intVersion = asInt(input); // integer view of the float bits
+  // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
+  // exponent mask    = 0111 1111 1000 0000 0000 0000 0000 0000
+  //                    0x7  0xF  0x8  0x0  0x0  0x0  0x0  0x0
+  // non-exponent     = 1000 0000 0111 1111 1111 1111 1111 1111
+  //                  = 0x8  0x0  0x7  0xF  0xF  0xF  0xF  0xF
+
+  //const int exponentMask(0x7F800000)
+  static const int nonexponentMask = 0x807FFFFF; // keeps sign and mantissa bits, clears the exponent field
+
+  // We want the reduced version to have an exponent of -1 which is
+  // -1 + 127 after biasing or 126
+  static const int exponentNeg1 = (126l << 23);
+  // NOTE(boulos): We don't need to mask anything out since we know
+  // the sign bit has to be 0. If it's 1, we need to return infinity/nan
+  // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
+  auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128]
+
+  auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
+  exponent = offsetExponent - 127;          // get the real value
+
+  // Blend the offset_exponent with the original input (do this in
+  // int for now, until I decide if float can have & and &not)
+  auto blended = (intVersion & nonexponentMask) | (exponentNeg1);
+  reduced = asFloat(blended); // back to a float value with the replaced exponent field
+}
+
+template <typename T> struct ExponentType                 { };                      // primary template: declares no Ty
+template <int N>      struct ExponentType<vfloat_impl<N>> { using Ty = vint<N>; };  // vfloat_impl<N> pairs with vint<N>
+template <>           struct ExponentType<float>          { using Ty = int; };      // scalar float pairs with int
+
+// Approximate natural log of v: NaN for v < 0, -infinity for v == 0, otherwise polynomial on the reduced value plus exponent * ln(2).
+template <typename T>
+__forceinline T log(const T &v)
+{
+  T reduced;                             // filled in by __rangeReduceLog
+  typename ExponentType<T>::Ty exponent; // filled in by __rangeReduceLog
+
+  const int nanBits = 0x7fc00000;
+  const int negInfBits = 0xFF800000;
+  const float nan = cast_i2f(nanBits);
+  const float negInf = cast_i2f(negInfBits);
+  auto useNan = v < 0.;
+  auto useInf = v == 0.;
+  auto exceptional = useNan | useInf;
+  const float one = 1.0;
+
+  auto patched = select(exceptional, one, v); // exceptional lanes are computed on 1 and overwritten at the end
+  __rangeReduceLog(patched, reduced, exponent);
+
+  const float ln2 = 0.693147182464599609375;
+  auto x1 = one - reduced; // the polynomial below is evaluated in (1 - reduced)
+  const float c1 = +0.50000095367431640625;
+  const float c2 = +0.33326041698455810546875;
+  const float c3 = +0.2519190013408660888671875;
+  const float c4 = +0.17541764676570892333984375;
+  const float c5 = +0.3424419462680816650390625;
+  const float c6 = -0.599632322788238525390625;
+  const float c7 = +1.98442304134368896484375;
+  const float c8 = -2.4899270534515380859375;
+  const float c9 = +1.7491014003753662109375;
+
+  auto result = x1 * c9 + c8; // Horner evaluation, highest coefficient first
+  result = x1 * result + c7;
+  result = x1 * result + c6;
+  result = x1 * result + c5;
+  result = x1 * result + c4;
+  result = x1 * result + c3;
+  result = x1 * result + c2;
+  result = x1 * result + c1;
+  result = x1 * result + one;
+
+  // Equation was for -(ln(red)/(1-red))
+  result *= -x1;
+  result += toFloat(exponent) * ln2; // add the exponent's contribution
+
+  return select(exceptional,
+                select(useNan, T(nan), T(negInf)),
+                result);
+}
+
+// Approximate pow(x, y): exp(y * log(|x|)), followed by explicit patches for x == 0, x < 0 and non-finite inputs.
+template <typename T>
+__forceinline T pow(const T &x, const T &y)
+{
+  auto x1 = abs(x);
+  auto z = exp(y * log(x1));
+  // Handle special cases
+  const float twoOver23 = 8388608.0f; // NOTE(review): the value is 2^23, despite the name
+  auto yInt = y == round(y);          // lanes where y is integral
+  auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit (presumably the parity bit of y - confirm)
+
+  // x == 0
+  z = select(x == 0.0f,
+      select(y < 0.0f, T(inf) | signmsk(x),
+      select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z);
+
+  // x < 0
+  auto xNegative = x < 0.0f;
+  if (any(xNegative))
+  {
+    auto z1 = z | asFloat(yOddInt);                                  // OR the yOddInt sign bit into z
+    z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN()); // non-integral y gives NaN
+    z = select(xNegative, z1, z);
+  }
+
+  auto xFinite = isfinite(x);
+  auto yFinite = isfinite(y);
+  if (all(xFinite & yFinite)) // early out: nothing below applies when every lane is finite
+    return z;
+
+  // x finite and y infinite (NOTE(review): confirm that andn(xFinite, yFinite) selects exactly these lanes)
+  z = select(andn(xFinite, yFinite),
+      select(x1 == 1.0f, 1.0f,
+      select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z);
+
+  // x infinite
+  z = select(xFinite, z,
+      select(y == 0.0f, 1.0f,
+      select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x)));
+
+  return z;
+}
+
+// Overload taking a plain float exponent:
+// converts y to T and forwards to pow(x, T(y)).
+template <typename T> __forceinline T pow(const T &x, float y) {
+  return pow(x, T(y));
+}
+
+} // namespace fastapprox
+
+} // namespace embree
--- a/thirdparty/embree/common/math/vec2.h
+++ b/thirdparty/embree/common/math/vec2.h
@@ -0,0 +1,236 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "emath.h"
+
+namespace embree
+{
+  struct Vec2fa;
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Generic 2D vector Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Vec2 // two components of type T, accessible as x/y or by index
+  {
+    enum { N = 2 }; // number of components
+    union {
+      struct { T x, y; };
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+      T components[N]; // array view sharing storage with x and y
+#endif
+    };
+
+    typedef T Scalar; // component type
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2( ) {} // performs no explicit initialization of x and y
+    __forceinline explicit Vec2( const T& a             ) : x(a), y(a) {} // same value for both components
+    __forceinline          Vec2( const T& x, const T& y ) : x(x), y(y) {}
+
+    __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; }
+    Vec2( const Vec2fa& other ); // declared only here; specializations appear after vec2fa.h is included
+
+    template<typename T1> __forceinline Vec2( const Vec2<T1>& a ) : x(T(a.x)), y(T(a.y)) {} // explicit per-component cast to T
+    template<typename T1> __forceinline Vec2& operator =( const Vec2<T1>& other ) { x = other.x; y = other.y; return *this; }
+
+    __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2( ZeroTy   ) : x(zero), y(zero) {}       // tag constructors: both components take the tagged constant
+    __forceinline Vec2( OneTy    ) : x(one),  y(one) {}
+    __forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {}
+    __forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {}
+
+#if defined(__WIN32__) && _MSC_VER == 1800 // workaround for older VS 2013 compiler
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; } // indexes from the address of x
+	__forceinline       T& operator [](const size_t axis)       { assert(axis < 2); return (&x)[axis]; }
+#else
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; } // asserts axis < 2
+	__forceinline       T& operator [](const size_t axis )      { assert(axis < 2); return components[axis]; }
+#endif
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> operator+(const Vec2<T>& a) { return Vec2<T>(+a.x, +a.y); }             // unary plus per component
+  template<typename T> __forceinline Vec2<T> operator-(const Vec2<T>& a) { return Vec2<T>(-a.x, -a.y); }             // negation per component
+  template<typename T> __forceinline Vec2<T> abs(const Vec2<T>& a)       { return Vec2<T>(abs(a.x), abs(a.y)); }     // scalar abs per component
+  template<typename T> __forceinline Vec2<T> rcp(const Vec2<T>& a)       { return Vec2<T>(rcp(a.x), rcp(a.y)); }     // scalar rcp per component
+  template<typename T> __forceinline Vec2<T> rsqrt(const Vec2<T>& a)     { return Vec2<T>(rsqrt(a.x), rsqrt(a.y)); } // scalar rsqrt per component
+  template<typename T> __forceinline Vec2<T> sqrt(const Vec2<T>& a)      { return Vec2<T>(sqrt(a.x), sqrt(a.y)); }   // scalar sqrt per component
+  template<typename T> __forceinline Vec2<T> frac(const Vec2<T>& a)      { return Vec2<T>(frac(a.x), frac(a.y)); }   // scalar frac per component
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> operator+(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(a.x + b.x, a.y + b.y); } // vector op vector
+  template<typename T> __forceinline Vec2<T> operator+(const Vec2<T>& a, const T& b)       { return Vec2<T>(a.x + b, a.y + b); }     // vector op scalar
+  template<typename T> __forceinline Vec2<T> operator+(const T& a, const Vec2<T>& b)       { return Vec2<T>(a + b.x, a + b.y); }     // scalar op vector
+  template<typename T> __forceinline Vec2<T> operator-(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(a.x - b.x, a.y - b.y); }
+  template<typename T> __forceinline Vec2<T> operator-(const Vec2<T>& a, const T& b)       { return Vec2<T>(a.x - b, a.y - b); }
+  template<typename T> __forceinline Vec2<T> operator-(const T& a, const Vec2<T>& b)       { return Vec2<T>(a - b.x, a - b.y); }
+  template<typename T> __forceinline Vec2<T> operator*(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(a.x * b.x, a.y * b.y); }
+  template<typename T> __forceinline Vec2<T> operator*(const T& a, const Vec2<T>& b)       { return Vec2<T>(a * b.x, a * b.y); }
+  template<typename T> __forceinline Vec2<T> operator*(const Vec2<T>& a, const T& b)       { return Vec2<T>(a.x * b, a.y * b); }
+  template<typename T> __forceinline Vec2<T> operator/(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(a.x / b.x, a.y / b.y); }
+  template<typename T> __forceinline Vec2<T> operator/(const Vec2<T>& a, const T& b)       { return Vec2<T>(a.x / b, a.y / b); }
+  template<typename T> __forceinline Vec2<T> operator/(const T& a, const Vec2<T>& b)       { return Vec2<T>(a / b.x, a / b.y); }
+  // min/max apply the scalar min/max to each pair of components
+  template<typename T> __forceinline Vec2<T> min(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(min(a.x, b.x), min(a.y, b.y)); }
+  template<typename T> __forceinline Vec2<T> max(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(max(a.x, b.x), max(a.y, b.y)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> madd(const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c)  { return Vec2<T>(madd(a.x, b.x, c.x), madd(a.y, b.y, c.y)); }
+  template<typename T> __forceinline Vec2<T> msub(const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c)  { return Vec2<T>(msub(a.x, b.x, c.x), msub(a.y, b.y, c.y)); }
+  template<typename T> __forceinline Vec2<T> nmadd(const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a.x, b.x, c.x), nmadd(a.y, b.y, c.y)); }
+  template<typename T> __forceinline Vec2<T> nmsub(const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a.x, b.x, c.x), nmsub(a.y, b.y, c.y)); }
+  // variants whose first operand is a single scalar shared by both components
+  template<typename T> __forceinline Vec2<T> madd(const T& a, const Vec2<T>& b, const Vec2<T>& c)  { return Vec2<T>(madd(a, b.x, c.x), madd(a, b.y, c.y)); }
+  template<typename T> __forceinline Vec2<T> msub(const T& a, const Vec2<T>& b, const Vec2<T>& c)  { return Vec2<T>(msub(a, b.x, c.x), msub(a, b.y, c.y)); }
+  template<typename T> __forceinline Vec2<T> nmadd(const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a, b.x, c.x), nmadd(a, b.y, c.y)); }
+  template<typename T> __forceinline Vec2<T> nmsub(const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a, b.x, c.x), nmsub(a, b.y, c.y)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T>& operator+=(Vec2<T>& a, const Vec2<T>& b) { a.x += b.x; a.y += b.y; return a; } // per-component add
+  template<typename T> __forceinline Vec2<T>& operator-=(Vec2<T>& a, const Vec2<T>& b) { a.x -= b.x; a.y -= b.y; return a; } // per-component subtract
+  template<typename T> __forceinline Vec2<T>& operator*=(Vec2<T>& a, const T& b)       { a.x *= b; a.y *= b; return a; }     // scale by a scalar
+  template<typename T> __forceinline Vec2<T>& operator/=(Vec2<T>& a, const T& b)       { a.x /= b; a.y /= b; return a; }     // divide by a scalar
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T reduce_add(const Vec2<T>& a) { return a.x + a.y; }     // sum of the components
+  template<typename T> __forceinline T reduce_mul(const Vec2<T>& a) { return a.x * a.y; }     // product of the components
+  template<typename T> __forceinline T reduce_min(const Vec2<T>& a) { return min(a.x, a.y); } // min of the components
+  template<typename T> __forceinline T reduce_max(const Vec2<T>& a) { return max(a.x, a.y); } // max of the components
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator==(const Vec2<T>& a, const Vec2<T>& b) { return a.x == b.x && a.y == b.y; }
+  template<typename T> __forceinline bool operator!=(const Vec2<T>& a, const Vec2<T>& b) { return a.x != b.x || a.y != b.y; }
+  /*! Ordering by x first; y is only consulted when the x components do not differ. */
+  template<typename T> __forceinline bool operator<(const Vec2<T>& a, const Vec2<T>& b) {
+    if (a.x != b.x) return a.x < b.x;
+    return (a.y != b.y) && (a.y < b.y);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Shift Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Applies the scalar shift_right_1 to x and y independently. */
+  template<typename T> __forceinline Vec2<T> shift_right_1(const Vec2<T>& a)
+  { return Vec2<T>(shift_right_1(a.x), shift_right_1(a.y)); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T dot(const Vec2<T>& a, const Vec2<T>& b)      { return madd(a.x, b.x, a.y * b.y); } // madd of the x pair onto the y product
+  template<typename T> __forceinline Vec2<T> cross(const Vec2<T>& a)                { return Vec2<T>(-a.y, a.x); }        // (-y, x)
+  template<typename T> __forceinline T length(const Vec2<T>& a)                     { return sqrt(dot(a, a)); }
+  template<typename T> __forceinline Vec2<T> normalize(const Vec2<T>& a)            { return a * rsqrt(dot(a, a)); }
+  template<typename T> __forceinline T distance(const Vec2<T>& a, const Vec2<T>& b) { return length(a - b); }
+  template<typename T> __forceinline T det(const Vec2<T>& a, const Vec2<T>& b)      { return a.x * b.y - a.y * b.x; }
+  /*! Like normalize, except that a is returned unchanged where dot(a,a) compares equal to zero. */
+  template<typename T> __forceinline Vec2<T> normalize_safe(const Vec2<T>& a) {
+    const T lengthSquared = dot(a, a); return select(lengthSquared == T( zero ), a, a * rsqrt(lengthSquared));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! One bool decides for both components. */
+  template<typename T> __forceinline Vec2<T> select(bool s, const Vec2<T>& t, const Vec2<T>& f)
+  { return Vec2<T>(select(s, t.x, f.x), select(s, t.y, f.y)); }
+
+  /*! Each component has its own bool in s. */
+  template<typename T> __forceinline Vec2<T> select(const Vec2<bool>& s, const Vec2<T>& t, const Vec2<T>& f)
+  { return Vec2<T>(select(s.x, t.x, f.x), select(s.y, t.y, f.y)); }
+
+  /*! One mask of type T::Bool is applied to both components. */
+  template<typename T> __forceinline Vec2<T> select(const typename T::Bool& s, const Vec2<T>& t, const Vec2<T>& f)
+  { return Vec2<T>(select(s, t.x, f.x), select(s, t.y, f.y)); }
+
+  /*! Blend of v0 and v1 by t, evaluated as madd(Vec2(1-t), v0, t*v1). */
+  template<typename T> __forceinline Vec2<T> lerp(const Vec2<T>& v0, const Vec2<T>& v1, const T& t) {
+    const Vec2<T> weight0(T(1.0f) - t); return madd(weight0, v0, t * v1);
+  }
+
+  /*! Index (0 for x, 1 for y) of the component with the larger absolute value; 1 when x is not larger. */
+  template<typename T> __forceinline int maxDim ( const Vec2<T>& a )
+  {
+    const Vec2<T> magnitudes = abs(a);
+    return (magnitudes.x > magnitudes.y) ? 0 : 1;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Streams the vector as "(x, y)". */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2<T>& a)
+  { return cout << "(" << a.x << ", " << a.y << ")"; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Default template instantiations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  using Vec2b = Vec2<bool>;  // bool components
+  using Vec2i = Vec2<int>;   // int components
+  using Vec2f = Vec2<float>; // float components
+}
+
+#include "vec2fa.h"
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+#include "../simd/sse.h"
+#endif
+
+#if defined(__AVX__)
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} // definition of the Vec2fa constructor declared in Vec2, for T = float
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+  template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} // x and y are each initialized from one float of a
+#endif
+
+#if defined(__AVX__)
+  template<> __forceinline Vec2<vfloat8>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} // same pattern for vfloat8
+#endif
+
+#if defined(__AVX512F__)
+  template<> __forceinline Vec2<vfloat16>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} // same pattern for vfloat16
+#endif
+
+}
--- a/thirdparty/embree/common/math/vec2fa.h
+++ b/thirdparty/embree/common/math/vec2fa.h
@@ -0,0 +1,325 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "emath.h"
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#  include "vec2fa_sycl.h"
+#else
+
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec2fa Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec2fa // 2D float vector stored in a 16-byte aligned, four-lane SSE register
+  {
+    ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 2 }; // number of meaningful components
+    union {
+      __m128 m128;                 // all four lanes viewed as one SSE register
+      struct { float x,y,az,aw; }; // x,y are the vector; az,aw name the two upper lanes that only pad the register
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2fa( ) {} // leaves all four lanes uninitialized
+    __forceinline Vec2fa( const __m128 a ) : m128(a) {} // adopts all four lanes of a unchanged
+
+    __forceinline Vec2fa            ( const Vec2<float>& other  ) : m128(_mm_set_ps(other.y, other.y, other.y, other.x)) {} // fills az,aw with y exactly like Vec2fa(x,y), so no lane is left indeterminate for later full-register operations
+    __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; } // only x,y are overwritten; az,aw keep their previous contents
+
+    __forceinline Vec2fa            ( const Vec2fa& other ) { m128 = other.m128; } // copies the whole register, padding lanes included
+    __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {} // broadcasts a to all four lanes
+    __forceinline          Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {} // lane 0 = x, lanes 1..3 = y (_mm_set_ps lists lanes from highest to lowest)
+
+    __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} // converts four 32-bit integers to floats
+
+    __forceinline operator const __m128&() const { return m128; } // lets a Vec2fa be passed directly to SSE intrinsics
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline Vec2fa load( const void* const a ) { // aligned load: reads 16 bytes from a and zeroes lanes 2 and 3
+      return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); // mask keeps lanes 0,1 (x,y) only
+    }
+
+    static __forceinline Vec2fa loadu( const void* const a ) { // same as load, but a may be unaligned; still reads 16 bytes
+      return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
+    }
+
+    static __forceinline void storeu ( void* ptr, const Vec2fa& v ) { // unaligned store of the full register
+      _mm_storeu_ps((float*)ptr,v); // writes all four lanes (16 bytes), not just x and y
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2fa( ZeroTy   ) : m128(_mm_setzero_ps()) {}      // every lane 0
+    __forceinline Vec2fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}     // every lane 1
+    __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}  // every lane pos_inf
+    __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}  // every lane neg_inf
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; } // index 0 -> x, 1 -> y; range is checked by assert only
+    __forceinline       float& operator []( const size_t index )       { assert(index < 2); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; } // unary plus: returns a copy of a
+  __forceinline Vec2fa operator -( const Vec2fa& a ) { // negation by flipping the sign bit of every lane
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); // only the top (sign) bit of each 32-bit lane is set
+    return _mm_xor_ps(a.m128, mask);
+  }
+  __forceinline Vec2fa abs  ( const Vec2fa& a ) { // absolute value by clearing the sign bit of every lane
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); // every bit except the top (sign) bit is set
+    return _mm_and_ps(a.m128, mask);
+  }
+  __forceinline Vec2fa sign ( const Vec2fa& a ) { // presumably +1 where a >= 0 and -1 where a < 0; depends on blendv_ps, which is defined outside this excerpt
+    return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero))); // the mask is set in the lanes where a < 0
+  }
+
+  __forceinline Vec2fa rcp  ( const Vec2fa& a ) // per-lane reciprocal 1/a: hardware estimate refined by Newton-Raphson iteration
+  {
+#if defined(__aarch64__)
+        __m128 reciprocal = _mm_rcp_ps(a.m128);                              // initial estimate
+        reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); // refinement step: r = r * (2 - a*r)
+        reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); // second refinement step
+        return (const Vec2fa)reciprocal;
+#else
+#if defined(__AVX512VL__)
+    const Vec2fa r = _mm_rcp14_ps(a.m128); // initial estimate (rcp14 variant when AVX-512VL is enabled)
+#else
+    const Vec2fa r = _mm_rcp_ps(a.m128);   // initial estimate
+#endif
+
+#if defined(__AVX2__)
+    const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0f)); // First, compute 1 - a * r (which will be very close to 0); float literal as in the non-FMA branch
+    const Vec2fa res = _mm_fmadd_ps(r, h_n, r);            // Then compute r + r * h_n
+#else
+    const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r));  // First, compute 1 - a * r (which will be very close to 0)
+    const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n));             // Then compute r + r * h_n
+#endif
+
+    return res;
+#endif  //defined(__aarch64__)
+  }
+
+  __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } // square root of every lane
+  __forceinline Vec2fa sqr  ( const Vec2fa& a ) { return _mm_mul_ps(a,a); }     // square (a*a) of every lane
+
+  __forceinline Vec2fa rsqrt( const Vec2fa& a ) // per-lane reciprocal square root: hardware estimate refined by Newton-Raphson iteration
+  {
+#if defined(__aarch64__)
+        __m128 r = _mm_rsqrt_ps(a.m128);                         // initial estimate
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); // refinement step: r = r * (3 - a*r*r) / 2
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); // second refinement step
+        return r;
+#else
+
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128); // initial estimate (rsqrt14 variant when AVX-512VL is enabled)
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128);   // initial estimate
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); // one refinement step: 1.5*r - 0.5*a*r*r*r
+
+#endif
+  }
+
+  __forceinline Vec2fa zero_fix(const Vec2fa& a) { // guards near-zero lanes before a reciprocal; the mask marks lanes with |a| < min_rcp_input
+    return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); // presumably blendv_ps takes min_rcp_input where the mask is set and a elsewhere - confirm against its definition
+  }
+  __forceinline Vec2fa rcp_safe(const Vec2fa& a) { // reciprocal of a after zero_fix has been applied
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec2fa log ( const Vec2fa& a ) { // natural logarithm of x and y, computed with scalar logf
+    return Vec2fa(logf(a.x),logf(a.y));
+  }
+
+  __forceinline Vec2fa exp ( const Vec2fa& a ) { // exponential of x and y, computed with scalar expf
+    return Vec2fa(expf(a.x),expf(a.y));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); } // lane-wise a + b
+  __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); } // lane-wise a - b
+  __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); } // lane-wise a * b
+  __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); } // scaling: b is broadcast to every lane first
+  __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; } // scaling with the scalar on the left
+  __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); } // lane-wise a / b; all four lanes are divided, padding lanes included
+  __forceinline Vec2fa operator /( const Vec2fa& a, const float b        ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } // every lane of a divided by b
+  __forceinline Vec2fa operator /( const        float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } // a divided by every lane of b
+
+  __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); } // lane-wise minimum
+  __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); } // lane-wise maximum
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
+      // Reinterprets the float lanes of both operands as 32-bit integers,
+      // takes the signed integer minimum per lane and reinterprets the
+      // resulting bit patterns as floats again.
+      return _mm_castsi128_ps(_mm_min_epi32(_mm_castps_si128(a), _mm_castps_si128(b)));
+    }
+#endif
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
+      // Reinterprets the float lanes of both operands as 32-bit integers,
+      // takes the signed integer maximum per lane and reinterprets the
+      // resulting bit patterns as floats again.
+      return _mm_castsi128_ps(_mm_max_epi32(_mm_castps_si128(a), _mm_castps_si128(b)));
+    }
+#endif
+
+    __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) { // raises x and y to the same exponent b with scalar powf
+      return Vec2fa(powf(a.x,b),powf(a.y,b));
+    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__) // fused multiply-add intrinsics are used when AVX2 is enabled
+  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }  //  a*b + c
+  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }  //  a*b - c
+  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); } // -(a*b) + c
+  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); } // -(a*b) - c
+#else // fallback built from the separate Vec2fa multiply, add, subtract and negate operators
+  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }  //  a*b + c
+  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }  //  a*b - c
+  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;}  // (-a)*b + c
+  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; } // (-a)*b - c
+#endif
+
+  __forceinline Vec2fa madd  ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); }  // scalar a is broadcast, then forwarded to the vector overload
+  __forceinline Vec2fa msub  ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }  // scalar a is broadcast, then forwarded
+  __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); } // scalar a is broadcast, then forwarded
+  __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); } // scalar a is broadcast, then forwarded
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; } // compound forms delegate to the binary operator and return a by reference
+  __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
+  __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; } // lane-wise product
+  __forceinline Vec2fa& operator *=( Vec2fa& a, const float   b ) { return a = a * b; } // scaling by a scalar
+  __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; } // lane-wise quotient
+  __forceinline Vec2fa& operator /=( Vec2fa& a, const float   b ) { return a = a / b; } // division by a scalar
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }      // sum of x and y; the padding lanes are ignored
+  __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }      // product of x and y
+  __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); } // smaller of x and y, via the scalar min
+  __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); } // larger of x and y, via the scalar max
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; } // true when both x and y compare equal; "& 3" drops the two padding lanes
+  __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; } // true when x or y differs; padding lanes are dropped the same way
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__) // SSE4.1 provides a dedicated dot-product instruction
+  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { // 2D dot product a.x*b.x + a.y*b.y
+    return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F)); // mask 0x3F: upper nibble 0x3 multiplies lanes 0 and 1 only, lower nibble 0xF writes the sum to every lane; lane 0 is then extracted
+  }
+#else // fallback: lane-wise product followed by the x+y reduction
+  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { // 2D dot product a.x*b.x + a.y*b.y
+    return reduce_add(a*b);
+  }
+#endif
+
+  __forceinline Vec2fa cross ( const Vec2fa& a ) { // single-argument 2D "cross": returns (-y, x)
+    return Vec2fa(-a.y,a.x); // the result is perpendicular to a: its dot product with a is -y*x + x*y = 0
+  }
+
+  __forceinline float  sqr_length ( const Vec2fa& a )                { return dot(a,a); }        // squared length x*x + y*y
+  __forceinline float  rcp_length ( const Vec2fa& a )                { return rsqrt(dot(a,a)); } // 1/length, via the scalar rsqrt (defined outside this excerpt)
+  __forceinline float  rcp_length2( const Vec2fa& a )                { return rcp(dot(a,a)); }   // 1/(squared length), via the scalar rcp
+  __forceinline float  length   ( const Vec2fa& a )                  { return sqrt(dot(a,a)); }  // length of a
+  __forceinline Vec2fa normalize( const Vec2fa& a )                  { return a*rsqrt(dot(a,a)); } // a scaled by 1/length; no special handling of a zero-length input
+  __forceinline float  distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }     // length of the difference a-b
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { // scalar condition: the blend mask is all ones when s is true, all zeros otherwise
+    const __m128i zeros = _mm_setzero_si128(); // comparing zero with itself for equality sets every bit of every lane
+    return blendv_ps(f, t, s ? _mm_castsi128_ps(_mm_cmpeq_epi32(zeros, zeros)) : _mm_setzero_ps());
+  }
+
+  __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { // linear interpolation between v0 and v1 with weight t
+    return madd(1.0f-t,v0,t*v1); // (1-t)*v0 + t*v1, using the madd overload that takes a scalar first factor
+  }
+
+  __forceinline int maxDim ( const Vec2fa& a )
+  {
+    // Returns 0 when |x| is strictly larger than |y|, otherwise 1.
+    const Vec2fa magnitude = abs(a);
+    return (magnitude.x > magnitude.y) ? 0 : 1;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__) // NEON rounding instructions; trunc is kept disabled in all three variants
+  //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
+  __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); } // round every lane toward negative infinity
+  __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); } // round every lane toward positive infinity
+#elif defined (__SSE4_1__) // SSE4.1 rounding instruction with an explicit rounding mode
+  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO       ); }
+  __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
+#else // scalar fallback through floorf/ceilf on x and y
+  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y)); }
+  __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }
+  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) { // writes a as "(x, y)"; the padding lanes are not printed
+    return cout << "(" << a.x << ", " << a.y << ")"; // the stream is returned so that output can be chained
+  }
+
+  typedef Vec2fa Vec2fa_t; // alias for Vec2fa
+}
+
+#endif
--- a/thirdparty/embree/common/math/vec2fa_sycl.h
+++ b/thirdparty/embree/common/math/vec2fa_sycl.h
@@ -0,0 +1,270 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "emath.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  struct Vec3fa;
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SYCL Vec2fa Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec2fa // 2D float vector for the SYCL build: two plain floats instead of an SSE register
+  {
+    //ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 2 }; // number of components
+    struct { float x,y; }; // the two components, accessed directly as v.x and v.y
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2fa( ) {} // leaves x and y uninitialized
+    //__forceinline Vec2fa( const __m128 a ) : m128(a) {}
+    explicit Vec2fa(const Vec3fa& a); // declared only; the definition is not part of this struct body
+    
+    __forceinline explicit Vec2fa( const vfloat<4>& a ) { // takes elements 0 and 1 of a; the remaining elements are ignored
+      x = a[0];
+      y = a[1];
+    }
+
+    __forceinline Vec2fa            ( const Vec2<float>& other  ) { x = other.x; y = other.y; } // implicit conversion from the generic 2D vector
+    __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }
+
+    __forceinline Vec2fa            ( const Vec2fa& other ) { x = other.x; y = other.y; } // component-wise copy
+    __forceinline Vec2fa& operator =( const Vec2fa& other ) { x = other.x; y = other.y; return *this; }
+
+    __forceinline explicit Vec2fa( const float a ) : x(a), y(a) {} // both components set to a
+    __forceinline          Vec2fa( const float x, const float y) : x(x), y(y) {}
+
+    //__forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline Vec2fa load( const void* const a ) { // reads two consecutive floats starting at a
+      const float* ptr = (const float*)a;
+      return Vec2fa(ptr[0],ptr[1]);
+    }
+
+    static __forceinline Vec2fa loadu( const void* const a ) { // same body as load: reads two consecutive floats
+      const float* ptr = (const float*)a;
+      return Vec2fa(ptr[0],ptr[1]);
+    }
+
+    static __forceinline void storeu ( void* a, const Vec2fa& v ) { // writes x and y as two consecutive floats at a
+      float* ptr = (float*)a;
+      ptr[0] = v.x; ptr[1] = v.y;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2fa( ZeroTy   ) : x(0.0f), y(0.0f) {}           // (0, 0)
+    __forceinline Vec2fa( OneTy    ) : x(1.0f), y(1.0f) {}           // (1, 1)
+    __forceinline Vec2fa( PosInfTy ) : x(+INFINITY), y(+INFINITY) {} // (+inf, +inf)
+    __forceinline Vec2fa( NegInfTy ) : x(-INFINITY), y(-INFINITY) {} // (-inf, -inf)
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    //__forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
+    //__forceinline       float& operator []( const size_t index )       { assert(index < 2); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; } // unary plus: returns a copy of a
+  __forceinline Vec2fa operator -( const Vec2fa& a ) { return Vec2fa(-a.x,-a.y); } // component-wise negation
+  __forceinline Vec2fa abs  ( const Vec2fa& a ) { return Vec2fa(sycl::fabs(a.x),sycl::fabs(a.y)); } // component-wise absolute value
+  __forceinline Vec2fa sign ( const Vec2fa& a ) { return Vec2fa(sycl::sign(a.x),sycl::sign(a.y)); } // component-wise sycl::sign
+
+   // NOTE(review): rcp goes through sycl::native::recip; native functions presumably trade accuracy for speed - confirm this precision is acceptable for callers
+  __forceinline Vec2fa rcp  ( const Vec2fa& a ) { return Vec2fa(sycl::native::recip(a.x),sycl::native::recip(a.y)); } // component-wise reciprocal
+  __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return Vec2fa(sycl::sqrt(a.x),sycl::sqrt(a.y)); } // component-wise square root
+  __forceinline Vec2fa sqr  ( const Vec2fa& a ) { return Vec2fa(a.x*a.x,a.y*a.y); } // component-wise square
+  
+  __forceinline Vec2fa rsqrt( const Vec2fa& a ) { return Vec2fa(sycl::rsqrt(a.x),sycl::rsqrt(a.y)); } // component-wise reciprocal square root
+
+  __forceinline Vec2fa zero_fix(const Vec2fa& a) { // components with magnitude below min_rcp_input are replaced by +min_rcp_input
+    const bool x_tiny = sycl::fabs(a.x) < min_rcp_input;
+    const bool y_tiny = sycl::fabs(a.y) < min_rcp_input;
+    return Vec2fa(x_tiny ? min_rcp_input : a.x, y_tiny ? min_rcp_input : a.y);
+  }
+  __forceinline Vec2fa rcp_safe(const Vec2fa& a) { // reciprocal of a after zero_fix has been applied
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec2fa log ( const Vec2fa& a ) { // component-wise natural logarithm via sycl::log
+    return Vec2fa(sycl::log(a.x),sycl::log(a.y));
+  }
+
+  __forceinline Vec2fa exp ( const Vec2fa& a ) { // component-wise exponential via sycl::exp
+    return Vec2fa(sycl::exp(a.x),sycl::exp(a.y));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return Vec2fa(a.x+b.x, a.y+b.y); } // component-wise a + b
+  __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return Vec2fa(a.x-b.x, a.y-b.y); } // component-wise a - b
+  __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return Vec2fa(a.x*b.x, a.y*b.y); } // component-wise a * b
+  __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); } // scaling: b is applied to both components
+  __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; } // scaling with the scalar on the left
+  __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return Vec2fa(a.x/b.x, a.y/b.y); } // component-wise a / b
+  __forceinline Vec2fa operator /( const Vec2fa& a, const float b        ) { return Vec2fa(a.x/b, a.y/b); } // both components divided by b
+  __forceinline Vec2fa operator /( const        float a, const Vec2fa& b ) { return Vec2fa(a/b.x, a/b.y); } // a divided by each component of b
+
+  __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { // component-wise minimum via sycl::fmin
+    return Vec2fa(sycl::fmin(a.x,b.x), sycl::fmin(a.y,b.y));
+  }
+  __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { // component-wise maximum via sycl::fmax
+    return Vec2fa(sycl::fmax(a.x,b.x), sycl::fmax(a.y,b.y));
+  }
+
+/*
+#if defined(__SSE4_1__)
+    __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+#if defined(__SSE4_1__)
+    __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+    __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) {
+      return Vec2fa(powf(a.x,b),powf(a.y,b));
+    }
+*/
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return Vec2fa(madd(a.x,b.x,c.x), madd(a.y,b.y,c.y)); } // applies the scalar madd (defined outside this excerpt) to each component
+  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return Vec2fa(msub(a.x,b.x,c.x), msub(a.y,b.y,c.y)); } // scalar msub per component
+  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return Vec2fa(nmadd(a.x,b.x,c.x), nmadd(a.y,b.y,c.y)); } // scalar nmadd per component
+  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return Vec2fa(nmsub(a.x,b.x,c.x), nmsub(a.y,b.y,c.y)); } // scalar nmsub per component
+
+  __forceinline Vec2fa madd  ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); }  // scalar a is applied to both components, then forwarded to the vector overload
+  __forceinline Vec2fa msub  ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }  // scalar a is broadcast, then forwarded
+  __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); } // scalar a is broadcast, then forwarded
+  __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); } // scalar a is broadcast, then forwarded
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; } // compound forms delegate to the binary operator and return a by reference
+  __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
+  __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; } // component-wise product
+  __forceinline Vec2fa& operator *=( Vec2fa& a, const float   b ) { return a = a * b; } // scaling by a scalar
+  __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; } // component-wise quotient
+  __forceinline Vec2fa& operator /=( Vec2fa& a, const float   b ) { return a = a / b; } // division by a scalar
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; } // sum of x and y
+  __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; } // product of x and y
+  __forceinline float reduce_min(const Vec2fa& v) { return sycl::fmin(v.x,v.y); } // smaller of x and y via sycl::fmin
+  __forceinline float reduce_max(const Vec2fa& v) { return sycl::fmax(v.x,v.y); } // larger of x and y via sycl::fmax
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return a.x == b.x && a.y == b.y; } // true only when both components compare equal
+  __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return a.x != b.x || a.y != b.y; } // true when at least one component differs
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { // 2D dot product a.x*b.x + a.y*b.y
+    return reduce_add(a*b); // component-wise product followed by the x+y reduction
+  }
+
+  __forceinline Vec2fa cross ( const Vec2fa& a ) { // single-argument 2D "cross": returns (-y, x)
+    return Vec2fa(-a.y,a.x); // the result is perpendicular to a: its dot product with a is -y*x + x*y = 0
+  }
+
+  __forceinline float  sqr_length ( const Vec2fa& a )                { return dot(a,a); }        // squared length x*x + y*y
+  __forceinline float  rcp_length ( const Vec2fa& a )                { return rsqrt(dot(a,a)); } // 1/length, via the scalar rsqrt (defined outside this excerpt)
+  __forceinline float  rcp_length2( const Vec2fa& a )                { return rcp(dot(a,a)); }   // 1/(squared length), via the scalar rcp
+  __forceinline float  length   ( const Vec2fa& a )                  { return sqrt(dot(a,a)); }  // length of a
+  __forceinline Vec2fa normalize( const Vec2fa& a )                  { return a*rsqrt(dot(a,a)); } // a scaled by 1/length; no special handling of a zero-length input
+  __forceinline float  distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }     // length of the difference a-b
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { // returns the components of t when s is true, of f otherwise
+    const Vec2fa& chosen = s ? t : f; return Vec2fa(chosen.x, chosen.y); // the single flag s decides for both components
+  }
+
+  __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { // interpolation between v0 and v1 with weight t
+    return madd(1.0f-t,v0,t*v1); // forwards (1-t), v0 and t*v1 to the madd overload with a scalar first factor; presumably (1-t)*v0 + t*v1, given the scalar madd is a*b+c
+  }
+
+  __forceinline int maxDim ( const Vec2fa& a )
+  {
+    // Returns 0 when |x| is strictly larger than |y|, otherwise 1.
+    const Vec2fa magnitude = abs(a);
+    return (magnitude.x > magnitude.y) ? 0 : 1;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(sycl::trunc(a.x),sycl::trunc(a.y)); } // component-wise sycl::trunc
+  __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(sycl::floor(a.x),sycl::floor(a.y)); } // component-wise sycl::floor
+  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(sycl::ceil (a.x),sycl::ceil (a.y)); } // component-wise sycl::ceil
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  inline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) { // writes a as "(x, y)"
+    return cout << "(" << a.x << ", " << a.y << ")"; // the stream is returned so that output can be chained
+  }
+
+  /*template<>
+  __forceinline vfloat_impl<4>::vfloat_impl(const Vec2fa& a)
+  {
+    v = 0;
+    const unsigned int lid = get_sub_group_local_id();
+    if (lid == 0) v = a.x;
+    if (lid == 1) v = a.y;
+  }*/
+
+  typedef Vec2fa Vec2fa_t; // alias for Vec2fa
+}
--- a/thirdparty/embree/common/math/vec3.h
+++ b/thirdparty/embree/common/math/vec3.h
@@ -0,0 +1,357 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "emath.h"
+
+namespace embree
+{
+  // Forward declaration: Vec3 below declares a constructor taking a Vec3fa.
+  struct Vec3fa;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Generic 3D vector Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Three-component vector whose element type T is a template parameter.
+   *  x, y and z share storage with components[]; the array view is compiled
+   *  out for the VS 2013 toolchain (see the workaround guards below). */
+  template<typename T> struct Vec3
+  {
+    typedef T Scalar;
+    enum { N = 3 };
+
+    union {
+      struct {
+        T x, y, z;
+      };
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+      T components[N];
+#endif
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! Default construction performs no explicit initialization of x, y, z. */
+    __forceinline Vec3( ) {}
+
+    /*! Sets all three components to the same value a. */
+    __forceinline explicit Vec3( const T& a )
+      : x(a), y(a), z(a) {}
+
+    /*! Builds the vector from three individual components. */
+    __forceinline Vec3( const T& x, const T& y, const T& z )
+      : x(x), y(y), z(z) {}
+
+    /*! Component-wise copy. */
+    __forceinline Vec3( const Vec3& other )
+      : x(other.x), y(other.y), z(other.z) {}
+
+    /*! Construction from a Vec3fa; only declared here, the definitions are
+     *  provided as explicit specializations per element type. */
+    __forceinline Vec3( const Vec3fa& other );
+
+    /*! Converting copy from a Vec3 of another element type; each component is cast to T. */
+    template<typename T1> __forceinline Vec3( const Vec3<T1>& a )
+      : x(T(a.x)), y(T(a.y)), z(T(a.z)) {}
+
+    /*! Component-wise assignment from a Vec3 of another element type. */
+    template<typename T1> __forceinline Vec3& operator =( const Vec3<T1>& other )
+    {
+      x = other.x;
+      y = other.y;
+      z = other.z;
+      return *this;
+    }
+
+    /*! Component-wise copy assignment. */
+    __forceinline Vec3& operator =( const Vec3& other )
+    {
+      x = other.x;
+      y = other.y;
+      z = other.z;
+      return *this;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! Tag constructors: every component is filled with the constant that
+     *  matches the tag type (zero, one, pos_inf or neg_inf). */
+    __forceinline Vec3( ZeroTy   ) : x(zero),    y(zero),    z(zero)    {}
+    __forceinline Vec3( OneTy    ) : x(one),     y(one),     z(one)     {}
+    __forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {}
+    __forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! Indexed component access (0 = x, 1 = y, 2 = z); axis is only checked by assert. */
+#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler
+    __forceinline const T& operator []( const size_t axis ) const
+    {
+      assert(axis < 3);
+      return (&x)[axis];
+    }
+    __forceinline T& operator []( const size_t axis )
+    {
+      assert(axis < 3);
+      return (&x)[axis];
+    }
+#else
+    __forceinline const T& operator []( const size_t axis ) const
+    {
+      assert(axis < 3);
+      return components[axis];
+    }
+    __forceinline T& operator []( const size_t axis )
+    {
+      assert(axis < 3);
+      return components[axis];
+    }
+#endif
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Unary plus, applied per component. */
+  template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a ) {
+    return Vec3<T>(+a.x, +a.y, +a.z);
+  }
+
+  /*! Negation, applied per component. */
+  template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a ) {
+    return Vec3<T>(-a.x, -a.y, -a.z);
+  }
+
+  /*! abs() of each component. */
+  template<typename T> __forceinline Vec3<T> abs( const Vec3<T>& a ) {
+    return Vec3<T>(abs(a.x), abs(a.y), abs(a.z));
+  }
+
+  /*! rcp() of each component. */
+  template<typename T> __forceinline Vec3<T> rcp( const Vec3<T>& a ) {
+    return Vec3<T>(rcp(a.x), rcp(a.y), rcp(a.z));
+  }
+
+  /*! rsqrt() of each component. */
+  template<typename T> __forceinline Vec3<T> rsqrt( const Vec3<T>& a ) {
+    return Vec3<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z));
+  }
+
+  /*! sqrt() of each component. */
+  template<typename T> __forceinline Vec3<T> sqrt( const Vec3<T>& a ) {
+    return Vec3<T>(sqrt(a.x), sqrt(a.y), sqrt(a.z));
+  }
+
+  /*! Replaces every component whose abs() is below min_rcp_input by
+   *  T(min_rcp_input); other components pass through unchanged. Note that a
+   *  tiny negative component therefore becomes +min_rcp_input. */
+  template<typename T> __forceinline Vec3<T> zero_fix( const Vec3<T>& a )
+  {
+    const auto fx = select(abs(a.x)<min_rcp_input,T(min_rcp_input),a.x);
+    const auto fy = select(abs(a.y)<min_rcp_input,T(min_rcp_input),a.y);
+    const auto fz = select(abs(a.z)<min_rcp_input,T(min_rcp_input),a.z);
+    return Vec3<T>(fx,fy,fz);
+  }
+
+  /*! rcp() applied to the zero_fix()-ed vector. */
+  template<typename T> __forceinline Vec3<T> rcp_safe(const Vec3<T>& a)
+  {
+    const Vec3<T> fixed = zero_fix(a);
+    return rcp(fixed);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Component-wise sum and difference. */
+  template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a, const Vec3<T>& b ) {
+    return Vec3<T>(a.x + b.x, a.y + b.y, a.z + b.z);
+  }
+  template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a, const Vec3<T>& b ) {
+    return Vec3<T>(a.x - b.x, a.y - b.y, a.z - b.z);
+  }
+
+  /*! Products: vector*vector (component-wise), scalar*vector and vector*scalar. */
+  template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const Vec3<T>& b ) {
+    return Vec3<T>(a.x * b.x, a.y * b.y, a.z * b.z);
+  }
+  template<typename T> __forceinline Vec3<T> operator *( const T& a, const Vec3<T>& b ) {
+    return Vec3<T>(a * b.x, a * b.y, a * b.z);
+  }
+  template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const T& b ) {
+    return Vec3<T>(a.x * b, a.y * b, a.z * b);
+  }
+
+  /*! Quotients: vector/scalar, scalar/vector and vector/vector (component-wise). */
+  template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const T& b ) {
+    return Vec3<T>(a.x / b, a.y / b, a.z / b);
+  }
+  template<typename T> __forceinline Vec3<T> operator /( const T& a, const Vec3<T>& b ) {
+    return Vec3<T>(a / b.x, a / b.y, a / b.z);
+  }
+  template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const Vec3<T>& b ) {
+    return Vec3<T>(a.x / b.x, a.y / b.y, a.z / b.z);
+  }
+
+  /*! Component-wise min() and max(). */
+  template<typename T> __forceinline Vec3<T> min(const Vec3<T>& a, const Vec3<T>& b) {
+    return Vec3<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+  }
+  template<typename T> __forceinline Vec3<T> max(const Vec3<T>& a, const Vec3<T>& b) {
+    return Vec3<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+  }
+
+  /*! Shifts every component right / left by b. */
+  template<typename T> __forceinline Vec3<T> operator >>( const Vec3<T>& a, const int b ) {
+    return Vec3<T>(a.x >> b, a.y >> b, a.z >> b);
+  }
+  template<typename T> __forceinline Vec3<T> operator <<( const Vec3<T>& a, const int b ) {
+    return Vec3<T>(a.x << b, a.y << b, a.z << b);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Vector forms: forward each (a,b,c) component triple to the element-level
+   *  madd / msub / nmadd / nmsub. */
+  template<typename T> __forceinline Vec3<T> madd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) {
+    return Vec3<T>(madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z));
+  }
+  template<typename T> __forceinline Vec3<T> msub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) {
+    return Vec3<T>(msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z));
+  }
+  template<typename T> __forceinline Vec3<T> nmadd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) {
+    return Vec3<T>(nmadd(a.x,b.x,c.x), nmadd(a.y,b.y,c.y), nmadd(a.z,b.z,c.z));
+  }
+  template<typename T> __forceinline Vec3<T> nmsub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) {
+    return Vec3<T>(nmsub(a.x,b.x,c.x), nmsub(a.y,b.y,c.y), nmsub(a.z,b.z,c.z));
+  }
+
+  /*! Scalar-first forms: the same element-level calls with a single value a
+   *  shared by all three components. */
+  template<typename T> __forceinline Vec3<T> madd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) {
+    return Vec3<T>(madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z));
+  }
+  template<typename T> __forceinline Vec3<T> msub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) {
+    return Vec3<T>(msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z));
+  }
+  template<typename T> __forceinline Vec3<T> nmadd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) {
+    return Vec3<T>(nmadd(a,b.x,c.x), nmadd(a,b.y,c.y), nmadd(a,b.z,c.z));
+  }
+  template<typename T> __forceinline Vec3<T> nmsub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) {
+    return Vec3<T>(nmsub(a,b.x,c.x), nmsub(a,b.y,c.y), nmsub(a,b.z,c.z));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Adds the scalar b to every component of a. */
+  template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const T b )
+  {
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    return a;
+  }
+
+  /*! Component-wise in-place addition. */
+  template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const Vec3<T>& b )
+  {
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    return a;
+  }
+
+  /*! Component-wise in-place subtraction. */
+  template<typename T> __forceinline Vec3<T>& operator -=( Vec3<T>& a, const Vec3<T>& b )
+  {
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    return a;
+  }
+
+  /*! Multiplies every component of a by the scalar b. */
+  template<typename T> __forceinline Vec3<T>& operator *=( Vec3<T>& a, const T& b )
+  {
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    return a;
+  }
+
+  /*! Divides every component of a by the scalar b. */
+  template<typename T> __forceinline Vec3<T>& operator /=( Vec3<T>& a, const T& b )
+  {
+    a.x /= b;
+    a.y /= b;
+    a.z /= b;
+    return a;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Sum of the three components. */
+  template<typename T> __forceinline T reduce_add( const Vec3<T>& a ) {
+    return a.x + a.y + a.z;
+  }
+
+  /*! Product of the three components. */
+  template<typename T> __forceinline T reduce_mul( const Vec3<T>& a ) {
+    return a.x * a.y * a.z;
+  }
+
+  /*! Three-argument min() over the components. */
+  template<typename T> __forceinline T reduce_min( const Vec3<T>& a ) {
+    return min(a.x, a.y, a.z);
+  }
+
+  /*! Three-argument max() over the components. */
+  template<typename T> __forceinline T reduce_max( const Vec3<T>& a ) {
+    return max(a.x, a.y, a.z);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! True when all three components compare equal. */
+  template<typename T> __forceinline bool operator ==( const Vec3<T>& a, const Vec3<T>& b )
+  {
+    return a.x == b.x
+        && a.y == b.y
+        && a.z == b.z;
+  }
+
+  /*! True when at least one component compares unequal. */
+  template<typename T> __forceinline bool operator !=( const Vec3<T>& a, const Vec3<T>& b )
+  {
+    return a.x != b.x
+        || a.y != b.y
+        || a.z != b.z;
+  }
+
+  /*! Lexicographic ordering: the first component pair (x, then y, then z)
+   *  that differs decides; fully equal vectors are not less. */
+  template<typename T> __forceinline bool operator < ( const Vec3<T>& a, const Vec3<T>& b )
+  {
+    return (a.x != b.x) ? (a.x < b.x)
+         : (a.y != b.y) ? (a.y < b.y)
+         : (a.z != b.z) ? (a.z < b.z)
+         : false;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Shift Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Applies the element-level shift_right_1 to x, y and z. */
+  template<typename T> __forceinline Vec3<T> shift_right_1( const Vec3<T>& a )
+  {
+    const auto sx = shift_right_1(a.x);
+    const auto sy = shift_right_1(a.y);
+    const auto sz = shift_right_1(a.z);
+    return Vec3<T>(sx,sy,sz);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Single bool condition: the same s is handed to the element-level select of every component. */
+  template<typename T> __forceinline Vec3<T> select ( bool s, const Vec3<T>& t, const Vec3<T>& f )
+  {
+    return Vec3<T>(select(s,t.x,f.x),
+                   select(s,t.y,f.y),
+                   select(s,t.z,f.z));
+  }
+
+  /*! Per-component condition: s.x, s.y and s.z each steer their own component. */
+  template<typename T> __forceinline Vec3<T> select ( const Vec3<bool>& s, const Vec3<T>& t, const Vec3<T>& f )
+  {
+    return Vec3<T>(select(s.x,t.x,f.x),
+                   select(s.y,t.y,f.y),
+                   select(s.z,t.z,f.z));
+  }
+
+  /*! Condition of type T::Bool: the same s is handed to the element-level select of every component. */
+  template<typename T> __forceinline Vec3<T> select ( const typename T::Bool& s, const Vec3<T>& t, const Vec3<T>& f )
+  {
+    return Vec3<T>(select(s,t.x,f.x),
+                   select(s,t.y,f.y),
+                   select(s,t.z,f.z));
+  }
+
+  /*! Blends v0 and v1 with weight t by evaluating madd(Vec3(1-t), v0, t*v1). */
+  template<typename T>
+    __forceinline Vec3<T> lerp(const Vec3<T>& v0, const Vec3<T>& v1, const T& t)
+  {
+    const Vec3<T> w0(T(1.0f)-t);
+    return madd(w0,v0,t*v1);
+  }
+
+  /*! Picks an axis index from the components of abs(a) using strict '>'
+   *  comparisons: 0 if x beats y and z, 1 if y beats z after x failed to beat
+   *  y, otherwise 2. */
+  template<typename T> __forceinline int maxDim ( const Vec3<T>& a )
+  {
+    const Vec3<T> mag = abs(a);
+    if (mag.x > mag.y)
+      return (mag.x > mag.z) ? 0 : 2;
+    return (mag.y > mag.z) ? 1 : 2;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Per-component comparisons; each returns a Vec3<bool> holding the result
+   *  of the named relation for x, y and z. */
+  template<typename T> __forceinline Vec3<bool> eq_mask( const Vec3<T>& a, const Vec3<T>& b ) {
+    return Vec3<bool>(a.x==b.x, a.y==b.y, a.z==b.z);
+  }
+  template<typename T> __forceinline Vec3<bool> neq_mask(const Vec3<T>& a, const Vec3<T>& b ) {
+    return Vec3<bool>(a.x!=b.x, a.y!=b.y, a.z!=b.z);
+  }
+  template<typename T> __forceinline Vec3<bool> lt_mask( const Vec3<T>& a, const Vec3<T>& b ) {
+    return Vec3<bool>(a.x< b.x, a.y< b.y, a.z< b.z);
+  }
+  template<typename T> __forceinline Vec3<bool> le_mask( const Vec3<T>& a, const Vec3<T>& b ) {
+    return Vec3<bool>(a.x<=b.x, a.y<=b.y, a.z<=b.z);
+  }
+  template<typename T> __forceinline Vec3<bool> gt_mask( const Vec3<T>& a, const Vec3<T>& b ) {
+    return Vec3<bool>(a.x> b.x, a.y> b.y, a.z> b.z);
+  }
+  template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) {
+    return Vec3<bool>(a.x>=b.x, a.y>=b.y, a.z>=b.z);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Squared length: dot(a,a). */
+  template<typename T> __forceinline T sqr( const Vec3<T>& a ) {
+    return dot(a,a);
+  }
+
+  /*! Dot product, accumulated as madd(a.x,b.x, madd(a.y,b.y, a.z*b.z)). */
+  template<typename T> __forceinline T dot( const Vec3<T>& a, const Vec3<T>& b )
+  {
+    const auto yz = madd(a.y,b.y,a.z*b.z);
+    return madd(a.x,b.x,yz);
+  }
+
+  /*! sqrt of the squared length. */
+  template<typename T> __forceinline T length( const Vec3<T>& a ) {
+    return sqrt(sqr(a));
+  }
+
+  /*! rsqrt of the squared length. */
+  template<typename T> __forceinline T rcp_length( const Vec3<T>& a ) {
+    return rsqrt(sqr(a));
+  }
+
+  /*! a scaled by rsqrt of its squared length; a zero vector is not special-cased. */
+  template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a ) {
+    return a*rsqrt(sqr(a));
+  }
+
+  /*! Length of the difference a-b. */
+  template<typename T> __forceinline T distance( const Vec3<T>& a, const Vec3<T>& b ) {
+    return length(a-b);
+  }
+
+  /*! Cross product; each component is formed with one msub call and one plain product. */
+  template<typename T> __forceinline Vec3<T> cross( const Vec3<T>& a, const Vec3<T>& b )
+  {
+    const auto cx = msub(a.y,b.z,a.z*b.y);
+    const auto cy = msub(a.z,b.x,a.x*b.z);
+    const auto cz = msub(a.x,b.y,a.y*b.x);
+    return Vec3<T>(cx, cy, cz);
+  }
+  /*! Combines the two cross products cross(a,b) and cross(b,c) component by
+   *  component: for each axis the component is taken from cross_ab when the
+   *  subtracted product of the a/b pair has the smaller magnitude, otherwise
+   *  from cross_bc. */
+  template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c )
+  {
+    // Second (subtracted) products of each cross-product component.
+    const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x;
+    const T bc_x = b.z*c.y, bc_y = b.x*c.z, bc_z = b.y*c.x;
+    // Same msub expressions as cross(), with the products above reused.
+    const Vec3<T> cross_ab(msub(a.y,b.z,ab_x), msub(a.z,b.x,ab_y), msub(a.x,b.y,ab_z));
+    const Vec3<T> cross_bc(msub(b.y,c.z,bc_x), msub(b.z,c.x,bc_y), msub(b.x,c.y,bc_z));
+    // Per-axis choice: true selects the cross_ab component.
+    const auto sx = abs(ab_x) < abs(bc_x);
+    const auto sy = abs(ab_y) < abs(bc_y);
+    const auto sz = abs(ab_z) < abs(bc_z);
+    return Vec3<T>(select(sx,cross_ab.x,cross_bc.x),
+                   select(sy,cross_ab.y,cross_bc.y),
+                   select(sz,cross_ab.z,cross_bc.z));
+  }
+
+  /*! Sum of the three components. */
+  template<typename T> __forceinline T sum( const Vec3<T>& a ) {
+    return a.x+a.y+a.z;
+  }
+
+  /*! d.x*(d.y+d.z) + d.y*d.z, evaluated through madd. */
+  template<typename T> __forceinline T halfArea( const Vec3<T>& d ) {
+    return madd(d.x,(d.y+d.z),d.y*d.z);
+  }
+
+  /*! Twice halfArea(d). */
+  template<typename T> __forceinline T area( const Vec3<T>& d ) {
+    return 2.0f*halfArea(d);
+  }
+
+  /*! Like normalize(), but returns a unchanged when dot(a,a) equals zero. */
+  template<typename T> __forceinline Vec3<T> normalize_safe( const Vec3<T>& a )
+  {
+    const T lenSq = dot(a,a);
+    const Vec3<T> scaled = a*rsqrt(lenSq);
+    return select(lenSq == T( zero ), a, scaled);
+  }
+
+  /*! Squared distance of P to the line through Q0 and Q1, computed as
+   *  |cross(P-Q0, Q1-Q0)|^2 * rcp(|Q1-Q0|^2). Q0 == Q1 is not special-cased. */
+  template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& P, const Vec3<T>& Q0, const Vec3<T>& Q1)
+  {
+    const Vec3<T> dir = Q1-Q0;
+    const Vec3<T> nrm = cross(P-Q0,dir);
+    return dot(nrm,nrm)*rcp(dot(dir,dir));
+  }
+
+  /*! Same computation for callers that already hold the differences
+   *  PmQ0 = P-Q0 and Q1mQ0 = Q1-Q0. */
+  template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& PmQ0, const Vec3<T>& Q1mQ0)
+  {
+    const Vec3<T> nrm = cross(PmQ0,Q1mQ0);
+    return dot(nrm,nrm)*rcp(dot(Q1mQ0,Q1mQ0));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Streams a to cout formatted as "(x, y, z)"; each component is written
+   *  with whatever operator<< applies to T. */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3<T>& a) {
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+  }
+
+  /*! Short names for the common element types. */
+  using Vec3b = Vec3<bool>;
+  using Vec3i = Vec3<int>;
+  using Vec3f = Vec3<float>;
+}
+
+#include "vec3ba.h"
+#include "vec3ia.h"
+#include "vec3fa.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / AVX-512 specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+#include "../simd/sse.h"
+#endif
+
+#if defined(__AVX__)
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  /*! Builds a Vec3<Out> from element k of each component of a, converting
+   *  every picked element to Out. */
+  template<typename Out, typename In>
+  __forceinline Vec3<Out> broadcast(const Vec3<In>& a, const size_t k)
+  {
+    return Vec3<Out>(Out(a.x[k]),
+                     Out(a.y[k]),
+                     Out(a.z[k]));
+  }
+
+  /*! Vec3<float> from Vec3fa: copies x, y and z. */
+  template<> __forceinline Vec3<float>::Vec3(const Vec3fa& a)
+    : x(a.x), y(a.y), z(a.z) {}
+
+#if !defined(__SYCL_DEVICE_ONLY__)
+  // Branch used whenever this is not a SYCL device-only compilation.
+
+#if defined(__AVX__)
+  /*! Vec3<vfloat4> from Vec3fa (AVX build): assigns the scalars a.x, a.y, a.z
+   *  to the vfloat4 members. NOTE(review): relies on vfloat4 accepting a float
+   *  in assignment, which is defined outside this file. */
+  template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
+    x = a.x; y = a.y; z = a.z;
+  }
+#elif defined(__SSE__) || defined(__ARM_NEON)
+  /*! Vec3<vfloat4> from Vec3fa (SSE/NEON build without AVX): wraps a.m128 in
+   *  a vfloat4 once and fills x, y, z via shuffle<0,0,0,0>, shuffle<1,1,1,1>
+   *  and shuffle<2,2,2,2> of that register. */
+  template<>
+  __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
+    const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
+  }
+#endif
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+  /*! broadcast specialization for vfloat4 -> vfloat4: passes the address of
+   *  element k of each component to vfloat4::broadcast. */
+  template<>
+  __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
+    return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
+  }
+
+  /*! Applies the same shuffle<i0,i1,i2,i3> to the x, y and z registers. */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline Vec3<vfloat4> shuffle(const Vec3<vfloat4>& b) {
+    return Vec3<vfloat4>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z));
+  }
+#endif
+
+#if defined(__AVX__)
+  /*! Vec3<vfloat8> from Vec3fa: assigns the scalars a.x, a.y, a.z to the vfloat8 members. */
+  template<>
+  __forceinline Vec3<vfloat8>::Vec3(const Vec3fa& a) {
+    x = a.x; y = a.y; z = a.z;
+  }
+
+  /*! broadcast specializations producing vfloat8 from a vfloat4 or vfloat8
+   *  source: pass the address of element k of each component to vfloat8::broadcast. */
+  template<>
+  __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
+    return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
+  }
+  template<>
+  __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat8>(const Vec3<vfloat8>& a, const size_t k) {
+    return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
+  }
+
+  /*! Applies the same shuffle<i0,i1,i2,i3> to the x, y and z registers. */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline Vec3<vfloat8> shuffle(const Vec3<vfloat8>& b) {
+    return Vec3<vfloat8>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z));
+  }
+#endif
+
+#if defined(__AVX512F__)
+  /*! Vec3<vfloat16> from Vec3fa: initializes the members from the scalars a.x, a.y, a.z. */
+  template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {}
+#endif
+  
+#else
+  // SYCL device-only compilation: only the Vec3fa constructors are
+  // specialized here, each assigning a.x, a.y, a.z to the members.
+
+#if defined(__SSE__)
+  template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
+    x = a.x; y = a.y; z = a.z;
+  }
+#endif
+#if defined(__AVX__)
+  template<> __forceinline Vec3<vfloat8>::Vec3(const Vec3fa& a) {
+    x = a.x; y = a.y; z = a.z;
+  }
+#endif
+#if defined(__AVX512F__)
+  template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) {
+    x = a.x; y = a.y; z = a.z;
+  }
+#endif
+#endif
+}
--- a/thirdparty/embree/common/math/vec3ba.h
+++ b/thirdparty/embree/common/math/vec3ba.h
@@ -0,0 +1,127 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "emath.h"
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#  include "vec3ba_sycl.h"
+#else
+
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3ba Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Three-lane boolean mask stored in one __m128 register. The union lets
+   *  the first three 32-bit lanes also be read and written as the ints
+   *  x, y, z; the fourth lane has no named member. */
+  struct __aligned(16) Vec3ba
+  {
+    ALIGNED_STRUCT_(16);
+    
+    union {
+      __m128 m128;
+      struct { int x,y,z; };
+    };
+
+    typedef int Scalar;
+    enum { N = 3 };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! Default construction leaves m128 without an explicit initializer. */
+    __forceinline Vec3ba( ) {}
+    /*! Wraps an existing register (implicit conversion from __m128). */
+    __forceinline Vec3ba( const __m128  input ) : m128(input) {}
+    __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {}
+    __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = other.m128; return *this; }
+
+    /*! Same flag for all lanes: the index into mm_lookupmask_ps has bits 0..3
+     *  all equal to a (i.e. index 0 or 15).
+     *  NOTE(review): the table itself is defined elsewhere; presumably each
+     *  index bit maps to an all-ones / all-zero lane - confirm. */
+    __forceinline explicit Vec3ba( bool a )
+      : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
+    /*! Individual flags: index bit 0 = a, bit 1 = b, bit 2 = c; bit 3 stays clear. */
+    __forceinline Vec3ba( bool a, bool b, bool c)
+      : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+
+    /*! Implicit access to the underlying register. */
+    __forceinline operator const __m128&() const { return m128; }
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! All lanes zero. */
+    __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {}
+    /*! All lanes all-ones: comparing a zero register with itself for equality sets every bit. */
+    __forceinline Vec3ba( TrueTy  ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! Lane access as int by index, addressed relative to x; index is only checked by assert. */
+    __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Lane-wise NOT: XORs a with the all-true mask. */
+  __forceinline Vec3ba operator !( const Vec3ba& a )
+  {
+    const Vec3ba allTrue(embree::True);
+    return Vec3ba(_mm_xor_ps(a.m128, allTrue.m128));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Bitwise AND of the two mask registers. */
+  __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) {
+    return Vec3ba(_mm_and_ps(a.m128, b.m128));
+  }
+
+  /*! Bitwise OR of the two mask registers. */
+  __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) {
+    return Vec3ba(_mm_or_ps(a.m128, b.m128));
+  }
+
+  /*! Bitwise XOR of the two mask registers. */
+  __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) {
+    return Vec3ba(_mm_xor_ps(a.m128, b.m128));
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  /*! In-place AND; returns a. */
+  __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b )
+  {
+    a = a & b;
+    return a;
+  }
+
+  /*! In-place OR; returns a. */
+  __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b )
+  {
+    a = a | b;
+    return a;
+  }
+
+  /*! In-place XOR; returns a. */
+  __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b )
+  {
+    a = a ^ b;
+    return a;
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  /*! True when lanes x, y and z hold identical 32-bit patterns; the fourth
+   *  lane is masked out of the result. */
+  __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b )
+  {
+    const __m128i sameLanes = _mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128));
+    const int xyz = _mm_movemask_ps(_mm_castsi128_ps(sameLanes)) & 7;
+    return xyz == 7;
+  }
+
+  /*! True when at least one of the lanes x, y, z differs. */
+  __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b )
+  {
+    const __m128i sameLanes = _mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128));
+    const int xyz = _mm_movemask_ps(_mm_castsi128_ps(sameLanes)) & 7;
+    return xyz != 7;
+  }
+
+  /*! Lexicographic ordering on the int lanes: the first differing lane
+   *  (x, then y, then z) decides; equal masks are not less. */
+  __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b )
+  {
+    return (a.x != b.x) ? (a.x < b.x)
+         : (a.y != b.y) ? (a.y < b.y)
+         : (a.z != b.z) ? (a.z < b.z)
+         : false;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+    
+  /*! True when the sign bits of lanes x, y and z are all set. */
+  __forceinline bool reduce_and( const Vec3ba& a )
+  {
+    const int xyz = _mm_movemask_ps(a.m128) & 0x7;
+    return xyz == 0x7;
+  }
+
+  /*! True when the sign bit of at least one of the lanes x, y, z is set. */
+  __forceinline bool reduce_or ( const Vec3ba& a )
+  {
+    const int xyz = _mm_movemask_ps(a.m128) & 0x7;
+    return xyz != 0x0;
+  }
+
+  /*! Same result as reduce_and. */
+  __forceinline bool all ( const Vec3ba& b ) {
+    return reduce_and(b);
+  }
+
+  /*! Same result as reduce_or. */
+  __forceinline bool any ( const Vec3ba& b ) {
+    return reduce_or(b);
+  }
+
+  /*! True when none of the lanes x, y, z has its sign bit set. */
+  __forceinline bool none ( const Vec3ba& b ) {
+    return !reduce_or(b);
+  }
+
+  /*! Sign bits of lanes x, y, z packed into bits 0..2. */
+  __forceinline size_t movemask(const Vec3ba& a) {
+    return size_t(_mm_movemask_ps(a.m128) & 0x7);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Streams a to cout as "(x, y, z)", writing "1" for a non-zero lane and "0" for a zero lane. */
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) {
+    return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")";
+  }
+}
+
+#endif
--- a/thirdparty/embree/common/math/vec3ba_sycl.h
+++ b/thirdparty/embree/common/math/vec3ba_sycl.h
@@ -0,0 +1,115 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "emath.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SYCL Vec3ba Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Three boolean flags x, y, z stored as plain bools in a 16-byte aligned
+   *  struct. The __m128 constructor, the __m128 conversion operators and
+   *  operator[] are kept only as commented-out code in this variant. */
+  struct __aligned(16) Vec3ba
+  {
+    //ALIGNED_STRUCT_(16);
+
+    struct { bool x,y,z; };
+
+    typedef bool Scalar;
+    enum { N = 3 };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! Default construction leaves the flags uninitialized. */
+    __forceinline Vec3ba( ) {}
+    //__forceinline Vec3ba( const __m128  input ) : m128(input) {}
+
+    /*! Flag-wise copy. */
+    __forceinline Vec3ba( const Vec3ba& other )
+      : x(other.x), y(other.y), z(other.z) {}
+
+    /*! Flag-wise copy assignment. */
+    __forceinline Vec3ba& operator =(const Vec3ba& other)
+    {
+      x = other.x;
+      y = other.y;
+      z = other.z;
+      return *this;
+    }
+
+    /*! Individual flags. */
+    __forceinline Vec3ba( bool a, bool b, bool c)
+      : x(a), y(b), z(c) {}
+
+    /*! Same flag for x, y and z. */
+    __forceinline explicit Vec3ba( bool a )
+      : Vec3ba(a, a, a) {}
+
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! All flags false / all flags true. */
+    __forceinline Vec3ba( FalseTy ) : Vec3ba(false, false, false) {}
+    __forceinline Vec3ba( TrueTy  ) : Vec3ba(true,  true,  true ) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    //__forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    //__forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Flag-wise logical NOT. */
+  __forceinline Vec3ba operator !( const Vec3ba& a )
+  {
+    const bool nx = !a.x;
+    const bool ny = !a.y;
+    const bool nz = !a.z;
+    return Vec3ba(nx, ny, nz);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Flag-wise AND. */
+  __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b )
+  {
+    const bool rx = a.x & b.x;
+    const bool ry = a.y & b.y;
+    const bool rz = a.z & b.z;
+    return Vec3ba(rx, ry, rz);
+  }
+
+  /*! Flag-wise OR. */
+  __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b )
+  {
+    const bool rx = a.x | b.x;
+    const bool ry = a.y | b.y;
+    const bool rz = a.z | b.z;
+    return Vec3ba(rx, ry, rz);
+  }
+
+  /*! Flag-wise XOR, expressed as inequality of the two flags. */
+  __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b )
+  {
+    const bool rx = a.x != b.x;
+    const bool ry = a.y != b.y;
+    const bool rz = a.z != b.z;
+    return Vec3ba(rx, ry, rz);
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  /*! In-place AND; returns a. */
+  __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b )
+  {
+    a = a & b;
+    return a;
+  }
+
+  /*! In-place OR; returns a. */
+  __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b )
+  {
+    a = a | b;
+    return a;
+  }
+
+  /*! In-place XOR; returns a. */
+  __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b )
+  {
+    a = a ^ b;
+    return a;
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  /*! True when all three flags match. */
+  __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b )
+  {
+    if (a.x != b.x) return false;
+    if (a.y != b.y) return false;
+    return a.z == b.z;
+  }
+
+  /*! True when at least one flag differs. */
+  __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b )
+  {
+    if (a.x != b.x) return true;
+    if (a.y != b.y) return true;
+    return a.z != b.z;
+  }
+/*
+  __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) {
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    if (a.z != b.z) return a.z < b.z;
+    return false;
+  }
+*/
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+    
+  /*! AND of the three flags (non-short-circuit '&'). */
+  __forceinline bool reduce_and( const Vec3ba& a )
+  {
+    const bool xy = a.x & a.y;
+    return xy & a.z;
+  }
+
+  /*! OR of the three flags (non-short-circuit '|'). */
+  __forceinline bool reduce_or ( const Vec3ba& a )
+  {
+    const bool xy = a.x | a.y;
+    return xy | a.z;
+  }
+
+  /*! Same result as reduce_and. */
+  __forceinline bool all ( const Vec3ba& b ) {
+    return reduce_and(b);
+  }
+
+  /*! Same result as reduce_or. */
+  __forceinline bool any ( const Vec3ba& b ) {
+    return reduce_or(b);
+  }
+
+  /*! True when no flag is set. */
+  __forceinline bool none ( const Vec3ba& b ) {
+    const bool anySet = reduce_or(b);
+    return !anySet;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Streams a to cout as "(x, y, z)", writing "1" for a set flag and "0"
+   *  for a cleared one - the same text the SSE Vec3ba output operator
+   *  produces. Previously this overload ignored a and wrote nothing.
+   *  NOTE(review): assumes the device-side embree_ostream accepts const char*
+   *  operands, as the string literals streamed by the SYCL Vec2fa output
+   *  operator suggest - confirm against the embree_ostream definition. */
+  inline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) {
+    return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")";
+  }
+}
--- a/thirdparty/embree/common/math/vec3fa.h
+++ b/thirdparty/embree/common/math/vec3fa.h
@@ -0,0 +1,791 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "emath.h"
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#  include "vec3fa_sycl.h"
+#else
+
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3fa Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3fa
+  {
+    ALIGNED_STRUCT_(16);
+
+    using Scalar = float;
+    enum { N = 3 };
+
+    /*! one SSE register of storage; x, y and z overlay its first three float lanes */
+    union {
+      __m128 m128;
+      struct { float x,y,z; };
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! default construction leaves every lane uninitialized */
+    __forceinline Vec3fa( ) {}
+
+    /*! adopts a raw SSE register, all four lanes unchanged */
+    __forceinline Vec3fa( const __m128 a ) : m128(a) {}
+
+    /*! converts from Vec3<float>; the fourth lane becomes 0 */
+    __forceinline Vec3fa( const Vec3<float>& other ) : m128(_mm_set_ps(0, other.z, other.y, other.x)) {}
+    //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
+
+    /*! copy construction and copy assignment duplicate the whole register */
+    __forceinline Vec3fa( const Vec3fa& other ) : m128(other.m128) {}
+    __forceinline Vec3fa& operator =( const Vec3fa& other ) {
+      m128 = other.m128;
+      return *this;
+    }
+
+    /*! broadcasts a to all four lanes */
+    __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {}
+
+    /*! builds (x, y, z) with the fourth lane set to 0 */
+    __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}
+
+    /*! converts four 32-bit integers to float via _mm_cvtepi32_ps */
+    __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+
+    /*! explicit casts: vfloat4 and Vec2fa wrap the register as-is,
+     *  vint4 and Vec3ia convert the lanes to integers with _mm_cvtps_epi32 */
+    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
+    __forceinline explicit operator const   vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
+    __forceinline explicit operator const  Vec2fa() const { return Vec2fa(m128); }
+    __forceinline explicit operator const  Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
+    
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! loads four floats with _mm_load_ps and forces the fourth lane to 0 */
+    static __forceinline Vec3fa load( const void* const a ) {
+#if defined(__aarch64__)
+        __m128 t = _mm_load_ps((float*)a);
+        t[3] = 0.0f;
+        return Vec3fa(t);
+#else
+      const __m128 xyz_mask = _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1));
+      return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a), xyz_mask));
+#endif
+    }
+
+    /*! loads four floats with _mm_loadu_ps; the fourth lane is kept as loaded */
+    static __forceinline Vec3fa loadu( const void* const a ) {
+      const __m128 raw = _mm_loadu_ps((float*)a);
+      return Vec3fa(raw);
+    }
+
+    /*! writes all four lanes of v to ptr with _mm_storeu_ps */
+    static __forceinline void storeu ( void* ptr, const Vec3fa& v ) {
+      float* const dst = (float*)ptr;
+      _mm_storeu_ps(dst,v.m128);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! tag constructors: every lane is filled with the named constant */
+    __forceinline Vec3fa( ZeroTy   ) : m128(_mm_setzero_ps()) {}
+    __forceinline Vec3fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! component access by index; only 0, 1 and 2 pass the assertion */
+    __forceinline const float& operator []( const size_t index ) const {
+      assert(index < 3);
+      return (&x)[index];
+    }
+    __forceinline       float& operator []( const size_t index )       {
+      assert(index < 3);
+      return (&x)[index];
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! unary plus: returns a copy of a */
+  __forceinline Vec3fa operator +( const Vec3fa& a ) { return Vec3fa(a.m128); }
+
+  /*! unary minus: negates every lane */
+  __forceinline Vec3fa operator -( const Vec3fa& a ) {
+#if defined(__aarch64__)
+    return Vec3fa(vnegq_f32(a.m128));
+#else
+    // XOR with 0x80000000 flips the sign bit of each lane
+    const __m128 sign_bits = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+    return Vec3fa(_mm_xor_ps(a.m128, sign_bits));
+#endif
+  }
+
+  /*! absolute value of every lane */
+  __forceinline Vec3fa abs  ( const Vec3fa& a ) {
+#if defined(__aarch64__)
+    return Vec3fa(_mm_abs_ps(a.m128));
+#else
+    // AND with 0x7fffffff clears the sign bit of each lane
+    const __m128 magnitude_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+    return Vec3fa(_mm_and_ps(a.m128, magnitude_bits));
+#endif
+  }
+
+  /*! blends between +1 and -1 using the per-lane mask (a < 0);
+   *  presumably -1 where a < 0 and +1 elsewhere, given blendv_ps operand order */
+  __forceinline Vec3fa sign ( const Vec3fa& a ) {
+    const __m128 pos_one = Vec3fa(one).m128;
+    const __m128 neg_one = (-Vec3fa(one)).m128;
+    const __m128 is_neg  = _mm_cmplt_ps (a.m128,Vec3fa(zero).m128);
+    return Vec3fa(blendv_ps(pos_one, neg_one, is_neg));
+  }
+
+  /*! per-lane reciprocal 1/a: a true division on AArch64, otherwise a hardware
+   *  estimate improved by one Newton-Raphson step (est + est*(1 - a*est)) */
+  __forceinline Vec3fa rcp  ( const Vec3fa& a )
+  {
+#if defined(__aarch64__)
+  return Vec3fa(vdivq_f32(vdupq_n_f32(1.0f),a.m128));
+#else
+
+#if defined(__AVX512VL__)
+    const __m128 est = _mm_rcp14_ps(a.m128);
+#else
+    const __m128 est = _mm_rcp_ps(a.m128);
+#endif
+
+#if defined(__AVX2__)
+    const __m128 err = _mm_fnmadd_ps(a.m128, est, vfloat4(1.0));           // 1 - a*est, expected to be close to 0
+    return Vec3fa(_mm_fmadd_ps(est, err, est));                            // est + est*err
+#else
+    const __m128 err = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, est)); // 1 - a*est, expected to be close to 0
+    return Vec3fa(_mm_add_ps(est,_mm_mul_ps(est, err)));                   // est + est*err
+#endif
+
+#endif  //defined(__aarch64__)
+  }
+
+  /*! per-lane square root */
+  __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return Vec3fa(_mm_sqrt_ps(a.m128)); }
+  /*! per-lane square a*a */
+  __forceinline Vec3fa sqr  ( const Vec3fa& a ) { return Vec3fa(_mm_mul_ps(a.m128,a.m128)); }
+
+  /*! per-lane reciprocal square root: a hardware estimate followed by refinement
+   *  (two vrsqrtsq_f32 steps on AArch64, one step 1.5*r - 0.5*a*r^3 elsewhere) */
+  __forceinline Vec3fa rsqrt( const Vec3fa& a )
+  {
+#if defined(__aarch64__)
+        __m128 est = _mm_rsqrt_ps(a.m128);
+        est = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(a.m128, est), est));
+        est = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(a.m128, est), est));
+        return Vec3fa(est);
+#else
+
+#if defined(__AVX512VL__)
+    const __m128 est = _mm_rsqrt14_ps(a.m128);
+#else
+    const __m128 est = _mm_rsqrt_ps(a.m128);
+#endif
+    const __m128 lead    = _mm_mul_ps(_mm_set1_ps(1.5f),est);                            // 1.5*r
+    const __m128 half_ar = _mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), est);      // -0.5*a*r
+    const __m128 est_sq  = _mm_mul_ps(est, est);                                         // r*r
+    return Vec3fa(_mm_add_ps(lead, _mm_mul_ps(half_ar, est_sq)));
+#endif
+  }
+
+  /*! blends a with min_rcp_input using the per-lane mask |a| < min_rcp_input;
+   *  presumably lanes under the threshold become min_rcp_input, given blendv_ps operand order */
+  __forceinline Vec3fa zero_fix(const Vec3fa& a) {
+    const __m128 floor_val = _mm_set1_ps(min_rcp_input);
+    const __m128 too_small = _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input));
+    return Vec3fa(blendv_ps(a.m128, floor_val, too_small));
+  }
+  /*! reciprocal of a after passing it through zero_fix */
+  __forceinline Vec3fa rcp_safe(const Vec3fa& a) {
+    const Vec3fa guarded = zero_fix(a);
+    return rcp(guarded);
+  }
+  /*! logf applied to x, y and z */
+  __forceinline Vec3fa log ( const Vec3fa& a ) {
+    const float lx = logf(a.x);
+    const float ly = logf(a.y);
+    const float lz = logf(a.z);
+    return Vec3fa(lx,ly,lz);
+  }
+
+  /*! expf applied to x, y and z */
+  __forceinline Vec3fa exp ( const Vec3fa& a ) {
+    const float ex = expf(a.x);
+    const float ey = expf(a.y);
+    const float ez = expf(a.z);
+    return Vec3fa(ex,ey,ez);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! per-lane sum, difference and product of two vectors */
+  __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(_mm_add_ps(a.m128, b.m128)); }
+  __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(_mm_sub_ps(a.m128, b.m128)); }
+  __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(_mm_mul_ps(a.m128, b.m128)); }
+  /*! scaling by a scalar that is first broadcast to a vector */
+  __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { const Vec3fa bb(b); return a * bb; }
+  __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { const Vec3fa aa(a); return aa * b; }
+  /*! per-lane division: vector/vector, vector/scalar and scalar/vector */
+  __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(_mm_div_ps(a.m128,b.m128)); }
+  __forceinline Vec3fa operator /( const Vec3fa& a, const float b        ) { return Vec3fa(_mm_div_ps(a.m128,_mm_set1_ps(b))); }
+  __forceinline Vec3fa operator /( const        float a, const Vec3fa& b ) { return Vec3fa(_mm_div_ps(_mm_set1_ps(a),b.m128)); }
+
+  /*! per-lane minimum and maximum */
+  __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(_mm_min_ps(a.m128,b.m128)); }
+  __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(_mm_max_ps(a.m128,b.m128)); }
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    /*! per-lane minimum taken on the raw bit patterns, compared as signed 32-bit integers */
+    __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
+      const vint4 lhs_bits = _mm_castps_si128(a.m128);
+      const vint4 rhs_bits = _mm_castps_si128(b.m128);
+      const vint4 smaller  = _mm_min_epi32(lhs_bits,rhs_bits);
+      return Vec3fa(_mm_castsi128_ps(smaller));
+    }
+
+    /*! per-lane maximum taken on the raw bit patterns, compared as signed 32-bit integers */
+    __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
+      const vint4 lhs_bits = _mm_castps_si128(a.m128);
+      const vint4 rhs_bits = _mm_castps_si128(b.m128);
+      const vint4 larger   = _mm_max_epi32(lhs_bits,rhs_bits);
+      return Vec3fa(_mm_castsi128_ps(larger));
+    }
+#endif
+
+    /*! raises x, y and z to the power b with powf */
+    __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) {
+      const float px = powf(a.x,b);
+      const float py = powf(a.y,b);
+      const float pz = powf(a.z,b);
+      return Vec3fa(px,py,pz);
+    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__) || defined(__ARM_NEON)
+  /*! fused forms: madd = a*b+c, msub = a*b-c, nmadd = -a*b+c, nmsub = -a*b-c */
+  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return Vec3fa(_mm_fmadd_ps(a.m128,b.m128,c.m128)); }
+  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return Vec3fa(_mm_fmsub_ps(a.m128,b.m128,c.m128)); }
+  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return Vec3fa(_mm_fnmadd_ps(a.m128,b.m128,c.m128)); }
+  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return Vec3fa(_mm_fnmsub_ps(a.m128,b.m128,c.m128)); }
+#else
+  /*! unfused forms built from separate multiply, negate and add/subtract */
+  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return (a*b)+c; }
+  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return (a*b)-c; }
+  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return ((-a)*b)+c; }
+  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return ((-a)*b)-c; }
+#endif
+
+  /*! scalar-first overloads: a is broadcast to a vector, then forwarded */
+  __forceinline Vec3fa madd  ( const float a, const Vec3fa& b, const Vec3fa& c) { const Vec3fa aa(a); return madd(aa,b,c); }
+  __forceinline Vec3fa msub  ( const float a, const Vec3fa& b, const Vec3fa& c) { const Vec3fa aa(a); return msub(aa,b,c); }
+  __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { const Vec3fa aa(a); return nmadd(aa,b,c); }
+  __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { const Vec3fa aa(a); return nmsub(aa,b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! compound assignment: each stores the binary result in a and returns a */
+  __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { a = a + b; return a; }
+  __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { a = a - b; return a; }
+  __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { a = a * b; return a; }
+  __forceinline Vec3fa& operator *=( Vec3fa& a, const float   b ) { a = a * b; return a; }
+  __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { a = a / b; return a; }
+  __forceinline Vec3fa& operator /=( Vec3fa& a, const float   b ) { a = a / b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__)
+  /*! x + y + z; the fourth lane is zeroed before the horizontal add */
+  __forceinline float reduce_add(const Vec3fa& v) {
+    float32x4_t lanes = v.m128;
+    lanes[3] = 0.0f;
+    return vaddvq_f32(lanes);
+  }
+
+  /*! smallest of x, y, z; z is copied over the fourth lane so that lane cannot win */
+  __forceinline float reduce_min(const Vec3fa& v) {
+    float32x4_t lanes = v.m128;
+    lanes[3] = lanes[2];
+    return vminvq_f32(lanes);
+  }
+  /*! largest of x, y, z; z is copied over the fourth lane so that lane cannot win */
+  __forceinline float reduce_max(const Vec3fa& v) {
+    float32x4_t lanes = v.m128;
+    lanes[3] = lanes[2];
+    return vmaxvq_f32(lanes);
+  }
+#else
+  /*! adds v to its shuffle<1> and shuffle<2> variants and returns lane 0 of the sum
+   *  (x + y + z, assuming shuffle<i> brings lane i into lane 0) */
+  __forceinline float reduce_add(const Vec3fa& v) {
+    const vfloat4 xs(v.m128);
+    const vfloat4 ys = shuffle<1>(xs);
+    const vfloat4 zs = shuffle<2>(xs);
+    return _mm_cvtss_f32(xs+ys+zs); 
+  }
+
+  /*! smallest / largest of x, y, z via the three-argument scalar helpers */
+  __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
+  __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
+#endif
+
+  /*! x * y * z, the same expression on every target */
+  __forceinline float reduce_mul(const Vec3fa& v) { return (v.x*v.y)*v.z; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! equality over x, y, z only: the fourth lane's bit is masked off with & 7 */
+  __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) {
+    const int eq_bits = _mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7;
+    return eq_bits == 7;
+  }
+  /*! inequality over x, y, z only: true when any of the three lanes differs */
+  __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) {
+    const int neq_bits = _mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7;
+    return neq_bits != 0;
+  }
+
+  /*! per-lane comparison masks */
+  __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(_mm_cmpeq_ps (a.m128, b.m128)); }
+  __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(_mm_cmpneq_ps(a.m128, b.m128)); }
+  __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(_mm_cmplt_ps (a.m128, b.m128)); }
+  __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(_mm_cmple_ps (a.m128, b.m128)); }
+#if defined(__aarch64__)
+  /*! AArch64: direct greater-than / greater-or-equal comparisons */
+  __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(_mm_cmpgt_ps (a.m128, b.m128)); }
+  __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(_mm_cmpge_ps (a.m128, b.m128)); }
+#else
+  /*! elsewhere: expressed as not-less-or-equal / not-less-than */
+  __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return Vec3ba(_mm_cmpnle_ps(a.m128, b.m128)); }
+  __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return Vec3ba(_mm_cmpnlt_ps(a.m128, b.m128)); }
+#endif
+
+  /*! all() over the Vec3ba mask of lanes strictly inside (-FLT_LARGE, +FLT_LARGE) */
+  __forceinline bool isvalid ( const Vec3fa& v ) {
+    const Vec3ba above_lo = gt_mask(v,Vec3fa(-FLT_LARGE));
+    const Vec3ba below_hi = lt_mask(v,Vec3fa(+FLT_LARGE));
+    return all(above_lo & below_hi);
+  }
+
+  /*! all() over the Vec3ba mask of lanes inside [-FLT_MAX, +FLT_MAX] */
+  __forceinline bool is_finite ( const Vec3fa& a ) {
+    const Vec3ba above_lo = ge_mask(a,Vec3fa(-FLT_MAX));
+    const Vec3ba below_hi = le_mask(a,Vec3fa(+FLT_MAX));
+    return all(above_lo & below_hi);
+  }
+
+  /*! same range test as isvalid, carried out on vfloat4 so all four lanes take part */
+  __forceinline bool isvalid4 ( const Vec3fa& v ) {
+    const vfloat4 lanes(v.m128);
+    return all((lanes > vfloat4(-FLT_LARGE)) & (lanes < vfloat4(+FLT_LARGE)));
+  }
+
+  /*! same range test as is_finite, carried out on vfloat4 so all four lanes take part */
+  __forceinline bool is_finite4 ( const Vec3fa& a ) {
+    const vfloat4 lanes(a.m128);
+    return all((lanes >= vfloat4(-FLT_MAX)) & (lanes <= vfloat4(+FLT_MAX)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+  /*! 3-component dot product via _mm_dp_ps; mask 0x7F multiplies lanes 0..2 only */
+  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
+    const __m128 summed = _mm_dp_ps(a.m128,b.m128,0x7F);
+    return _mm_cvtss_f32(summed);
+  }
+#else
+  /*! 3-component dot product: per-lane product followed by reduce_add */
+  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
+    const Vec3fa products = a*b;
+    return reduce_add(products);
+  }
+#endif
+
+  /*! computes a*shuffle<1,2,0,3>(b) - shuffle<1,2,0,3>(a)*b and applies the same
+   *  shuffle to the result (the usual rotated-lane cross-product formulation) */
+  __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )
+  {
+    const vfloat4 a_plain = vfloat4(a.m128);
+    const vfloat4 b_rot   = shuffle<1,2,0,3>(vfloat4(b.m128));
+    const vfloat4 a_rot   = shuffle<1,2,0,3>(vfloat4(a.m128));
+    const vfloat4 b_plain = vfloat4(b.m128);
+    return Vec3fa(shuffle<1,2,0,3>(msub(a_plain,b_rot,a_rot*b_plain)));
+  }
+
+  /*! squared length: dot(a,a) */
+  __forceinline float  sqr_length ( const Vec3fa& a ) {
+    const float len2 = dot(a,a);
+    return len2;
+  }
+  /*! rsqrt of the squared length */
+  __forceinline float  rcp_length ( const Vec3fa& a ) {
+    const float len2 = dot(a,a);
+    return rsqrt(len2);
+  }
+  /*! rcp of the squared length */
+  __forceinline float  rcp_length2( const Vec3fa& a ) {
+    const float len2 = dot(a,a);
+    return rcp(len2);
+  }
+  /*! sqrt of the squared length */
+  __forceinline float  length   ( const Vec3fa& a ) {
+    const float len2 = dot(a,a);
+    return sqrt(len2);
+  }
+  /*! a scaled by rsqrt(dot(a,a)) */
+  __forceinline Vec3fa normalize( const Vec3fa& a ) {
+    const float len2 = dot(a,a);
+    return a*rsqrt(len2);
+  }
+  /*! length of the difference a - b */
+  __forceinline float  distance ( const Vec3fa& a, const Vec3fa& b ) {
+    const Vec3fa delta = a-b;
+    return length(delta);
+  }
+  /*! x*(y+z) + y*z, evaluated through the scalar madd helper */
+  __forceinline float  halfArea ( const Vec3fa& d ) {
+    const float sum_yz  = d.y+d.z;
+    const float prod_yz = d.y*d.z;
+    return madd(d.x,sum_yz,prod_yz);
+  }
+  /*! twice halfArea(d) */
+  __forceinline float  area     ( const Vec3fa& d ) {
+    const float half = halfArea(d);
+    return 2.0f*half;
+  }
+
+  /*! like normalize, except that a is returned unchanged when dot(a,a) is exactly 0 */
+  __forceinline Vec3fa normalize_safe( const Vec3fa& a ) {
+    const float len2 = dot(a,a);
+    if (unlikely(len2 == 0.0f))
+      return a;
+    return a*rsqrt(len2);
+  }
+
+  /*! differentiated normalization: (dot(p,p)*dp - dot(p,dp)*p) * rcp(dot(p,p)) * rsqrt(dot(p,p)) */
+  __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)
+  {
+    const float p_dot_p  = dot(p,p);
+    const float p_dot_dp = dot(p,dp);
+    const Vec3fa numer = p_dot_p*dp-p_dot_dp*p;
+    return numer*rcp(p_dot_p)*rsqrt(p_dot_p);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! blends f and t with a mask that is all ones when s is true and all zeros otherwise
+   *  (presumably yielding t for true and f for false, given blendv_ps operand order) */
+  __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {
+    const __m128 all_set = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+    const __m128 none_set = _mm_setzero_ps();
+    const __m128 mask = s ? all_set : none_set;
+    return Vec3fa(blendv_ps(f.m128, t.m128, mask));
+  }
+
+  /*! blends f and t lane by lane using the mask s */
+  __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {
+    return Vec3fa(blendv_ps(f.m128, t.m128, s));
+  }
+
+  /*! linear interpolation (1-t)*v0 + t*v1, evaluated through madd */
+  __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) {
+    const float w0 = 1.0f-t;
+    return madd(w0,v0,t*v1);
+  }
+
+  /*! index (0, 1 or 2) chosen by comparing |x|, |y|, |z|: x and y are compared first,
+   *  the winner is then compared against z; z is returned unless the winner is strictly larger */
+  __forceinline int maxDim ( const Vec3fa& a )
+  {
+    const Vec3fa b = abs(a);
+    const int xy_winner = (b.x > b.y) ? 0 : 1;
+    return (b[xy_winner] > b.z) ? xy_winner : 2;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__)
+  /*! AArch64: NEON directed rounding (vrndm = toward -inf, vrndp = toward +inf, vrnd = toward zero) */
+  __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); }
+  __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); }
+  __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); }
+#elif defined (__SSE4_1__)
+  /*! SSE4.1: _mm_round_ps with an explicit rounding mode.
+   *  trunc has to round toward zero to agree with the AArch64 and scalar branches;
+   *  _MM_FROUND_TO_NEAREST_INT would turn e.g. 1.7 into 2 instead of 1. */
+  __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_ZERO       ); }
+  __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF    ); }
+#else
+  /*! scalar fallback: libm rounding applied to x, y and z (fourth lane becomes 0) */
+  __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
+  __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); }
+  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! writes a as "(x, y, z)" to the stream and returns the stream */
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) {
+    return cout << "("
+                << a.x << ", "
+                << a.y << ", "
+                << a.z << ")";
+  }
+
+  /*! alternative spelling of the Vec3fa type */
+  using Vec3fa_t = Vec3fa;
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3fx Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3fx
+  {
+    ALIGNED_STRUCT_(16);
+
+    using Scalar = float;
+    enum { N = 3 };
+
+    /*! one SSE register of storage; x, y, z overlay lanes 0..2 and the fourth lane
+     *  can be viewed as int a, unsigned u or float w */
+    union {
+      __m128 m128;
+      struct { float x,y,z; union { int a; unsigned u; float w; }; };
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! default construction leaves every lane uninitialized */
+    __forceinline Vec3fx( ) {}
+
+    /*! adopts a raw SSE register, all four lanes unchanged */
+    __forceinline Vec3fx( const __m128 a ) : m128(a) {}
+
+    /*! explicit construction from Vec3fa and implicit conversion back; both copy the whole register */
+    __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {}
+    __forceinline operator Vec3fa () const { return Vec3fa(m128); }
+        
+    /*! converts from Vec3<float>; the fourth lane becomes 0 */
+    __forceinline explicit Vec3fx( const Vec3<float>& other ) : m128(_mm_set_ps(0, other.z, other.y, other.x)) {}
+    //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
+
+    /*! copy construction and copy assignment duplicate the whole register */
+    __forceinline Vec3fx( const Vec3fx& other ) : m128(other.m128) {}
+    __forceinline Vec3fx& operator =( const Vec3fx& other ) {
+      m128 = other.m128;
+      return *this;
+    }
+
+    /*! broadcasts a to all four lanes */
+    __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {}
+
+    /*! builds (x, y, z) with the fourth lane set to 0 */
+    __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}
+
+    /*! copies other and then overwrites the fourth lane with an int / unsigned payload */
+    __forceinline Vec3fx( const Vec3fa& other, const int      a1) : m128(other.m128) { a = a1; }
+    __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) : m128(other.m128) { u = a1; }
+
+    /*! copies x, y, z from other and places w1 in the fourth lane */
+    __forceinline Vec3fx( const Vec3fa& other, const float    w1) {
+#if defined (__aarch64__)
+      m128 = other.m128;
+      m128[3] = w1;
+#elif defined (__SSE4_1__)
+      // immediate 0x30: insert element 0 of the source into lane 3
+      m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),0x30);
+#else
+      const vint4 keep_xyz(-1,-1,-1,0);
+      m128 = select(vboolf4(_mm_castsi128_ps(keep_xyz)),vfloat4(other.m128),vfloat4(w1));
+#endif
+    }
+    //__forceinline Vec3fx( const float x, const float y, const float z, const int      a) : x(x), y(y), z(z), a(a) {} // not working properly!
+    //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
+
+    /*! builds (x, y, z, w) from four floats */
+    __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {}
+    
+    //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+
+    /*! explicit casts: vfloat4 and Vec2fa wrap the register as-is,
+     *  vint4 and Vec3ia convert the lanes to integers with _mm_cvtps_epi32 */
+    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
+    __forceinline explicit operator const   vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
+    __forceinline explicit operator const  Vec2fa() const { return Vec2fa(m128); }
+    __forceinline explicit operator const  Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
+    
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! loads four floats with _mm_load_ps and clears the fourth lane */
+    static __forceinline Vec3fx load( const void* const a ) {
+      const __m128 xyz_mask = _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1));
+      return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a), xyz_mask));
+    }
+
+    /*! loads four floats with _mm_loadu_ps; the fourth lane is kept as loaded */
+    static __forceinline Vec3fx loadu( const void* const a ) {
+      const __m128 raw = _mm_loadu_ps((float*)a);
+      return Vec3fx(raw);
+    }
+
+    /*! writes all four lanes of v to ptr with _mm_storeu_ps */
+    static __forceinline void storeu ( void* ptr, const Vec3fx& v ) {
+      float* const dst = (float*)ptr;
+      _mm_storeu_ps(dst,v.m128);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! tag constructors: every lane is filled with the named constant */
+    __forceinline Vec3fx( ZeroTy   ) : m128(_mm_setzero_ps()) {}
+    __forceinline Vec3fx( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! component access by index; only 0, 1 and 2 pass the assertion */
+    __forceinline const float& operator []( const size_t index ) const {
+      assert(index < 3);
+      return (&x)[index];
+    }
+    __forceinline       float& operator []( const size_t index )       {
+      assert(index < 3);
+      return (&x)[index];
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! unary plus: returns a copy of a */
+  __forceinline Vec3fx operator +( const Vec3fx& a ) { return Vec3fx(a.m128); }
+
+  /*! unary minus: XOR with 0x80000000 flips the sign bit of every lane */
+  __forceinline Vec3fx operator -( const Vec3fx& a ) {
+    const __m128 sign_bits = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+    return Vec3fx(_mm_xor_ps(a.m128, sign_bits));
+  }
+
+  /*! absolute value: AND with 0x7fffffff clears the sign bit of every lane */
+  __forceinline Vec3fx abs  ( const Vec3fx& a ) {
+    const __m128 magnitude_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+    return Vec3fx(_mm_and_ps(a.m128, magnitude_bits));
+  }
+
+  /*! blends between +1 and -1 using the per-lane mask (a < 0);
+   *  presumably -1 where a < 0 and +1 elsewhere, given blendv_ps operand order */
+  __forceinline Vec3fx sign ( const Vec3fx& a ) {
+    const __m128 pos_one = Vec3fx(one).m128;
+    const __m128 neg_one = (-Vec3fx(one)).m128;
+    const __m128 is_neg  = _mm_cmplt_ps (a.m128,Vec3fx(zero).m128);
+    return Vec3fx(blendv_ps(pos_one, neg_one, is_neg));
+  }
+
+  /*! per-lane reciprocal 1/a: a hardware estimate improved by one
+   *  Newton-Raphson step, written as est * (2 - est*a) */
+  __forceinline Vec3fx rcp  ( const Vec3fx& a )
+  {
+#if defined(__AVX512VL__)
+    const __m128 est = _mm_rcp14_ps(a.m128);
+#else
+    const __m128 est = _mm_rcp_ps(a.m128);
+#endif
+
+#if defined(__AVX2__)
+    const __m128 corr = _mm_fnmadd_ps(est, a.m128, vfloat4(2.0f));            // 2 - est*a
+#else
+    const __m128 corr = _mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(est, a.m128));   // 2 - est*a
+    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif
+
+    return Vec3fx(_mm_mul_ps(est,corr));
+  }
+
+  /*! per-lane square root */
+  __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return Vec3fx(_mm_sqrt_ps(a.m128)); }
+  /*! per-lane square a*a */
+  __forceinline Vec3fx sqr  ( const Vec3fx& a ) { return Vec3fx(_mm_mul_ps(a.m128,a.m128)); }
+
+  /*! per-lane reciprocal square root: a hardware estimate r refined once as 1.5*r - 0.5*a*r^3 */
+  __forceinline Vec3fx rsqrt( const Vec3fx& a )
+  {
+#if defined(__AVX512VL__)
+    const __m128 est = _mm_rsqrt14_ps(a.m128);
+#else
+    const __m128 est = _mm_rsqrt_ps(a.m128);
+#endif
+    const __m128 lead    = _mm_mul_ps(_mm_set1_ps(1.5f),est);                            // 1.5*r
+    const __m128 half_ar = _mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), est);      // -0.5*a*r
+    const __m128 est_sq  = _mm_mul_ps(est, est);                                         // r*r
+    return Vec3fx(_mm_add_ps(lead, _mm_mul_ps(half_ar, est_sq)));
+  }
+
+  /*! blends a with min_rcp_input using the per-lane mask |a| < min_rcp_input;
+   *  presumably lanes under the threshold become min_rcp_input, given blendv_ps operand order */
+  __forceinline Vec3fx zero_fix(const Vec3fx& a) {
+    const __m128 floor_val = _mm_set1_ps(min_rcp_input);
+    const __m128 too_small = _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input));
+    return Vec3fx(blendv_ps(a.m128, floor_val, too_small));
+  }
+  /*! reciprocal of a after passing it through zero_fix */
+  __forceinline Vec3fx rcp_safe(const Vec3fx& a) {
+    const Vec3fx guarded = zero_fix(a);
+    return rcp(guarded);
+  }
+  /*! logf applied to x, y and z */
+  __forceinline Vec3fx log ( const Vec3fx& a ) {
+    const float lx = logf(a.x);
+    const float ly = logf(a.y);
+    const float lz = logf(a.z);
+    return Vec3fx(lx,ly,lz);
+  }
+
+  /*! expf applied to x, y and z */
+  __forceinline Vec3fx exp ( const Vec3fx& a ) {
+    const float ex = expf(a.x);
+    const float ey = expf(a.y);
+    const float ez = expf(a.z);
+    return Vec3fx(ex,ey,ez);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! per-lane sum, difference and product of two vectors */
+  __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(_mm_add_ps(a.m128, b.m128)); }
+  __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(_mm_sub_ps(a.m128, b.m128)); }
+  __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(_mm_mul_ps(a.m128, b.m128)); }
+  /*! scaling by a scalar that is first broadcast to a vector */
+  __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { const Vec3fx bb(b); return a * bb; }
+  __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { const Vec3fx aa(a); return aa * b; }
+  /*! per-lane division: vector/vector, vector/scalar and scalar/vector */
+  __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(_mm_div_ps(a.m128,b.m128)); }
+  __forceinline Vec3fx operator /( const Vec3fx& a, const float b        ) { return Vec3fx(_mm_div_ps(a.m128,_mm_set1_ps(b))); }
+  __forceinline Vec3fx operator /( const        float a, const Vec3fx& b ) { return Vec3fx(_mm_div_ps(_mm_set1_ps(a),b.m128)); }
+
+  /*! per-lane minimum and maximum */
+  __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(_mm_min_ps(a.m128,b.m128)); }
+  __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(_mm_max_ps(a.m128,b.m128)); }
+
+#if defined(__SSE4_1__) || defined(__aarch64__)
+    /*! per-lane minimum taken on the raw bit patterns, compared as signed 32-bit integers */
+    __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
+      const vint4 lhs_bits = _mm_castps_si128(a.m128);
+      const vint4 rhs_bits = _mm_castps_si128(b.m128);
+      const vint4 smaller  = _mm_min_epi32(lhs_bits,rhs_bits);
+      return Vec3fx(_mm_castsi128_ps(smaller));
+    }
+
+    /*! per-lane maximum taken on the raw bit patterns, compared as signed 32-bit integers */
+    __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
+      const vint4 lhs_bits = _mm_castps_si128(a.m128);
+      const vint4 rhs_bits = _mm_castps_si128(b.m128);
+      const vint4 larger   = _mm_max_epi32(lhs_bits,rhs_bits);
+      return Vec3fx(_mm_castsi128_ps(larger));
+    }
+#endif
+
+    /*! raises x, y and z to the power b with powf */
+    __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) {
+      const float px = powf(a.x,b);
+      const float py = powf(a.y,b);
+      const float pz = powf(a.z,b);
+      return Vec3fx(px,py,pz);
+    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  /*! fused forms: madd = a*b+c, msub = a*b-c, nmadd = -a*b+c, nmsub = -a*b-c */
+  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return Vec3fx(_mm_fmadd_ps(a.m128,b.m128,c.m128)); }
+  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return Vec3fx(_mm_fmsub_ps(a.m128,b.m128,c.m128)); }
+  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return Vec3fx(_mm_fnmadd_ps(a.m128,b.m128,c.m128)); }
+  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return Vec3fx(_mm_fnmsub_ps(a.m128,b.m128,c.m128)); }
+#else
+  /*! unfused forms built from separate multiply, negate and add/subtract */
+  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return (a*b)+c; }
+  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return (a*b)-c; }
+  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return ((-a)*b)+c; }
+  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return ((-a)*b)-c; }
+#endif
+
+  /*! scalar-first overloads: a is broadcast to a vector, then forwarded */
+  __forceinline Vec3fx madd  ( const float a, const Vec3fx& b, const Vec3fx& c) { const Vec3fx aa(a); return madd(aa,b,c); }
+  __forceinline Vec3fx msub  ( const float a, const Vec3fx& b, const Vec3fx& c) { const Vec3fx aa(a); return msub(aa,b,c); }
+  __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { const Vec3fx aa(a); return nmadd(aa,b,c); }
+  __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { const Vec3fx aa(a); return nmsub(aa,b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! compound assignment: each stores the binary result in a and returns a */
+  __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { a = a + b; return a; }
+  __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { a = a - b; return a; }
+  __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { a = a * b; return a; }
+  __forceinline Vec3fx& operator *=( Vec3fx& a, const float   b ) { a = a * b; return a; }
+  __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { a = a / b; return a; }
+  __forceinline Vec3fx& operator /=( Vec3fx& a, const float   b ) { a = a / b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! adds v to its shuffle<1> and shuffle<2> variants and returns lane 0 of the sum
+   *  (x + y + z, assuming shuffle<i> brings lane i into lane 0) */
+  __forceinline float reduce_add(const Vec3fx& v) { 
+    const vfloat4 xs(v.m128);
+    const vfloat4 ys = shuffle<1>(xs);
+    const vfloat4 zs = shuffle<2>(xs);
+    return _mm_cvtss_f32(xs+ys+zs); 
+  }
+
+  /*! x * y * z */
+  __forceinline float reduce_mul(const Vec3fx& v) { return (v.x*v.y)*v.z; }
+  /*! smallest / largest of x, y, z via the three-argument scalar helpers */
+  __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); }
+  __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! equality over x, y, z only: the fourth lane's bit is masked off with & 7 */
+  __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) {
+    const int eq_bits = _mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7;
+    return eq_bits == 7;
+  }
+  /*! inequality over x, y, z only: true when any of the three lanes differs */
+  __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) {
+    const int neq_bits = _mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7;
+    return neq_bits != 0;
+  }
+
+  /*! per-lane comparison masks; gt / ge are expressed as not-less-or-equal / not-less-than */
+  __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(_mm_cmpeq_ps (a.m128, b.m128)); }
+  __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(_mm_cmpneq_ps(a.m128, b.m128)); }
+  __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(_mm_cmplt_ps (a.m128, b.m128)); }
+  __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(_mm_cmple_ps (a.m128, b.m128)); }
+  __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(_mm_cmpnle_ps(a.m128, b.m128)); }
+  __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(_mm_cmpnlt_ps(a.m128, b.m128)); }
+
+  __forceinline bool isvalid ( const Vec3fx& v ) {
+    return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE)));
+  }
+
+  /*! True when every component tested by the Vec3ba mask passes both the
+   *  ge_mask test against -FLT_MAX and the le_mask test against +FLT_MAX. */
+  __forceinline bool is_finite ( const Vec3fx& a ) {
+    const Vec3ba not_below = ge_mask(a,Vec3fx(-FLT_MAX));
+    const Vec3ba not_above = le_mask(a,Vec3fx(+FLT_MAX));
+    return all(not_below & not_above);
+  }
+
+  /*! Four-lane variant of isvalid: all lanes of the register, including lane 3,
+   *  must be strictly greater than -FLT_LARGE and strictly less than +FLT_LARGE. */
+  __forceinline bool isvalid4 ( const Vec3fx& v ) {
+    const vfloat4 lanes(v.m128);
+    return all((lanes > vfloat4(-FLT_LARGE)) & (lanes < vfloat4(+FLT_LARGE)));
+  }
+
+  /*! Four-lane variant of is_finite: all lanes of the register, including lane 3,
+   *  must lie in the closed range [-FLT_MAX, +FLT_MAX]. */
+  __forceinline bool is_finite4 ( const Vec3fx& a ) {
+    const vfloat4 lanes(a.m128);
+    return all((lanes >= vfloat4(-FLT_MAX)) & (lanes <= vfloat4(+FLT_MAX)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Dot product of a and b.
+   *  With SSE4.1 a single _mm_dp_ps (immediate 0x7F) is used; otherwise the
+   *  component-wise product is summed with reduce_add. */
+  __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
+#if defined(__SSE4_1__)
+    return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
+#else
+    return reduce_add(a*b);
+#endif
+  }
+
+  /*! Cross product of a and b.
+   *  Combines a with a <1,2,0,3>-shuffled copy of b and a <1,2,0,3>-shuffled copy of a
+   *  with b through one msub (presumably a0*b0 - a1*b1 -- confirm against msub's definition),
+   *  then applies the same <1,2,0,3> shuffle to the result. Index 3 stays in the last
+   *  position of every shuffle. */
+  __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b )
+  {
+    vfloat4 a0 = vfloat4(a.m128);
+    vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
+    vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
+    vfloat4 b1 = vfloat4(b.m128);
+    return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
+  }
+
+  /*! Length-related helpers, all derived from dot(a,a). */
+  __forceinline float  sqr_length ( const Vec3fx& a )                { const float len2 = dot(a,a); return len2; }
+  __forceinline float  rcp_length ( const Vec3fx& a )                { const float len2 = dot(a,a); return rsqrt(len2); }
+  __forceinline float  rcp_length2( const Vec3fx& a )                { const float len2 = dot(a,a); return rcp(len2); }
+  __forceinline float  length   ( const Vec3fx& a )                  { const float len2 = dot(a,a); return sqrt(len2); }
+  /*! scales a by rsqrt(dot(a,a)); a zero vector is not special-cased here */
+  __forceinline Vec3fx normalize( const Vec3fx& a )                  { const float len2 = dot(a,a); return a*rsqrt(len2); }
+  /*! length of the difference a-b */
+  __forceinline float  distance ( const Vec3fx& a, const Vec3fx& b ) { const Vec3fx delta = a-b; return length(delta); }
+  /*! x*(y+z) + y*z evaluated through madd */
+  __forceinline float  halfArea ( const Vec3fx& d )                  { const float yz_sum = d.y+d.z; return madd(d.x,yz_sum,d.y*d.z); }
+  /*! twice halfArea(d) */
+  __forceinline float  area     ( const Vec3fx& d )                  { const float half = halfArea(d); return 2.0f*half; }
+
+  /*! Like normalize, but returns a unchanged when dot(a,a) is exactly zero. */
+  __forceinline Vec3fx normalize_safe( const Vec3fx& a ) {
+    const float len2 = dot(a,a);
+    if (unlikely(len2 == 0.0f))
+      return a;
+    return a*rsqrt(len2);
+  }
+
+  /*! Differentiated normalization: given p and its derivative dp, evaluates
+   *  (dot(p,p)*dp - dot(p,dp)*p) * rcp(dot(p,p)) * rsqrt(dot(p,p)). */
+  __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp)
+  {
+    const float  p_sqr    = dot(p,p);
+    const float  p_dot_dp = dot(p,dp);
+    const Vec3fx numer    = p_sqr*dp-p_dot_dp*p;
+    return numer*rcp(p_sqr)*rsqrt(p_sqr);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Whole-vector select: builds an all-ones mask when s is true and an all-zero
+   *  mask otherwise, then forwards f, t and that mask to blendv_ps. */
+  __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) {
+    __m128 mask = _mm_setzero_ps();
+    if (s) {
+      // comparing a register with itself for equality sets every bit
+      mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+    }
+    return blendv_ps(f.m128, t.m128, mask);
+  }
+
+  /*! Lane-wise select: forwards f, t and the mask s to blendv_ps. */
+  __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f )
+  {
+    return blendv_ps(f.m128,
+                     t.m128,
+                     s);
+  }
+
+  /*! Linear interpolation evaluated as madd(1-t, v0, t*v1). */
+  __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) {
+    const float one_minus_t = 1.0f-t;
+    return madd(one_minus_t,v0,t*v1);
+  }
+
+  /*! Index (0, 1 or 2) of the component with the largest absolute value.
+   *  All comparisons are strict, so ties resolve toward the higher index. */
+  __forceinline int maxDim ( const Vec3fx& a )
+  {
+    const Vec3fx mag = abs(a);
+    if (mag.x > mag.y)
+      return (mag.x > mag.z) ? 0 : 2;
+    return (mag.y > mag.z) ? 1 : 2;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Component-wise rounding: trunc rounds toward zero, floor toward negative
+   *  infinity and ceil toward positive infinity. One of three implementations is
+   *  selected at compile time (aarch64 NEON, SSE4.1, or scalar libm fallback). */
+#if defined(__aarch64__)
+  __forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); }
+  __forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); }
+  __forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); }
+#elif defined (__SSE4_1__)
+  // trunc has to use _MM_FROUND_TO_ZERO: _MM_FROUND_TO_NEAREST_INT would turn 1.7f
+  // into 2.0f, disagreeing with the truncf-based fallback below.
+  __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_ZERO       ); }
+  __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF    ); }
+#else
+  __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); }
+  __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); }
+  __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Streams the vector as "(x, y, z)"; the fourth lane is not printed. */
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a)
+  {
+    return cout << "("
+                << a.x << ", "
+                << a.y << ", "
+                << a.z << ")";
+  }
+
+  
+  /*! Vec3ff is an alias of Vec3fx. */
+  using Vec3ff = Vec3fx;
+}
+
+#endif
--- a/thirdparty/embree/common/math/vec3fa_sycl.h
+++ b/thirdparty/embree/common/math/vec3fa_sycl.h
@@ -0,0 +1,625 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "emath.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SYCL Vec3fa Type (scalar implementation)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Three-component float vector for the SYCL build, stored as plain scalar
+   *  members and aligned to 16 bytes. The fourth float (do_not_use) is padding:
+   *  no constructor, copy or store in this struct reads or writes it. */
+  struct __aligned(16) Vec3fa
+  {
+    //ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 3 };
+    struct { float x,y,z, do_not_use; };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! leaves all members uninitialized */
+    __forceinline Vec3fa( ) {}
+    //__forceinline Vec3fa( const __m128 a ) : m128(a) {}
+    //__forceinline explicit Vec3fa(const vfloat4& a) : x(a[0]), y(a[1]), z(a[2]) {}
+
+    /*! implicit conversion from the generic Vec3<float> */
+    __forceinline Vec3fa            ( const Vec3<float>& other  ) : x(other.x), y(other.y), z(other.z) {}
+    //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { x = other.x; y = other.y; z = other.z; return *this; }
+
+    /*! copy construction and assignment transfer x, y and z only */
+    __forceinline Vec3fa            ( const Vec3fa& other ) : x(other.x), y(other.y), z(other.z) {}
+    __forceinline Vec3fa& operator =( const Vec3fa& other ) {
+      x = other.x;
+      y = other.y;
+      z = other.z;
+      return *this;
+    }
+
+    /*! broadcast and per-component constructors */
+    __forceinline explicit Vec3fa( const float a ) : x(a), y(a), z(a) {}
+    __forceinline          Vec3fa( const float x, const float y, const float z) : x(x), y(y), z(z) {}
+
+    /*! converts each integer component with a float cast */
+    __forceinline explicit Vec3fa( const Vec3ia& a ) : x((float)a.x), y((float)a.y), z((float)a.z) {}
+
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+    /*! widens to a vfloat4 with 0.0f in the fourth lane */
+    __forceinline operator vfloat4() const { return vfloat4(x,y,z,0.0f); } // FIXME: we should not need this!!
+
+    //friend __forceinline Vec3fa copy_a( const Vec3fa& a, const Vec3fa& b ) { Vec3fa c = a; c.a = b.a; return c; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! reads three consecutive floats starting at a */
+    static __forceinline Vec3fa load( const void* const a ) {
+      const float* const src = (const float*)a;
+      return Vec3fa(src[0],src[1],src[2]);
+    }
+
+    /*! same three-float read as load */
+    static __forceinline Vec3fa loadu( const void* const a ) {
+      const float* const src = (const float*)a;
+      return Vec3fa(src[0],src[1],src[2]);
+    }
+
+    /*! writes x, y and z to three consecutive floats starting at a */
+    static __forceinline void storeu ( void* a, const Vec3fa& v ) {
+      float* const dst = (float*)a;
+      dst[0] = v.x;
+      dst[1] = v.y;
+      dst[2] = v.z;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fa( ZeroTy   ) : x(0.0f), y(0.0f), z(0.0f) {}
+    __forceinline Vec3fa( OneTy    ) : x(1.0f), y(1.0f), z(1.0f) {}
+    __forceinline Vec3fa( PosInfTy ) : x(+INFINITY), y(+INFINITY), z(+INFINITY) {}
+    __forceinline Vec3fa( NegInfTy ) : x(-INFINITY), y(-INFINITY), z(-INFINITY) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! component access by index; asserts index < 3 and indexes from the address of x */
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Component-wise unary operations on x, y and z. Each one builds a new Vec3fa
+   *  from three scalar results; rcp uses sycl::native::recip (the sycl::recip
+   *  variant is kept commented out below). */
+  __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
+  __forceinline Vec3fa operator -( const Vec3fa& a ) { return Vec3fa(-a.x,-a.y,-a.z); }
+  __forceinline Vec3fa abs  ( const Vec3fa& a ) { return Vec3fa(sycl::fabs(a.x),sycl::fabs(a.y),sycl::fabs(a.z)); }
+  __forceinline Vec3fa sign ( const Vec3fa& a ) { return Vec3fa(sycl::sign(a.x),sycl::sign(a.y),sycl::sign(a.z)); }
+
+  //__forceinline Vec3fa rcp  ( const Vec3fa& a ) { return Vec3fa(sycl::recip(a.x),sycl::recip(a.y),sycl::recip(a.z)); }
+  __forceinline Vec3fa rcp  ( const Vec3fa& a ) { return Vec3fa(sycl::native::recip(a.x),sycl::native::recip(a.y),sycl::native::recip(a.z)); }
+  __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return Vec3fa(sycl::sqrt(a.x),sycl::sqrt(a.y),sycl::sqrt(a.z)); }
+  __forceinline Vec3fa sqr  ( const Vec3fa& a ) { return Vec3fa(a.x*a.x,a.y*a.y,a.z*a.z); }
+
+  __forceinline Vec3fa rsqrt( const Vec3fa& a ) { return Vec3fa(sycl::rsqrt(a.x),sycl::rsqrt(a.y),sycl::rsqrt(a.z)); }
+
+  /*! Replaces every component whose magnitude is below min_rcp_input with
+   *  +min_rcp_input (the original sign is not kept); other components pass through. */
+  __forceinline Vec3fa zero_fix(const Vec3fa& a) {
+    float x = a.x;
+    float y = a.y;
+    float z = a.z;
+    if (sycl::fabs(x) < min_rcp_input) x = min_rcp_input;
+    if (sycl::fabs(y) < min_rcp_input) y = min_rcp_input;
+    if (sycl::fabs(z) < min_rcp_input) z = min_rcp_input;
+    return Vec3fa(x,y,z);
+  }
+  /*! Reciprocal of a after zero_fix has replaced near-zero components. */
+  __forceinline Vec3fa rcp_safe(const Vec3fa& a) {
+    const Vec3fa fixed = zero_fix(a);
+    return rcp(fixed);
+  }
+  /*! Component-wise sycl::log of x, y and z. */
+  __forceinline Vec3fa log ( const Vec3fa& a ) {
+    const float lx = sycl::log(a.x);
+    const float ly = sycl::log(a.y);
+    const float lz = sycl::log(a.z);
+    return Vec3fa(lx,ly,lz);
+  }
+
+  /*! Component-wise sycl::exp of x, y and z. */
+  __forceinline Vec3fa exp ( const Vec3fa& a ) {
+    const float ex = sycl::exp(a.x);
+    const float ey = sycl::exp(a.y);
+    const float ez = sycl::exp(a.z);
+    return Vec3fa(ex,ey,ez);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Component-wise arithmetic on x, y and z. The scalar multiplications broadcast
+   *  the float through Vec3fa(float) and reuse the vector product; the scalar
+   *  divisions divide each component directly. */
+  __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(a.x+b.x, a.y+b.y, a.z+b.z); }
+  __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(a.x-b.x, a.y-b.y, a.z-b.z); }
+  __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(a.x*b.x, a.y*b.y, a.z*b.z); }
+  __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); }
+  __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; }
+  __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(a.x/b.x, a.y/b.y, a.z/b.z); }
+  __forceinline Vec3fa operator /( const Vec3fa& a, const float b        ) { return Vec3fa(a.x/b, a.y/b, a.z/b); }
+  __forceinline Vec3fa operator /( const        float a, const Vec3fa& b ) { return Vec3fa(a/b.x, a/b.y, a/b.z); }
+
+  /*! Component-wise minimum via sycl::fmin. */
+  __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) {
+    const float lo_x = sycl::fmin(a.x,b.x);
+    const float lo_y = sycl::fmin(a.y,b.y);
+    const float lo_z = sycl::fmin(a.z,b.z);
+    return Vec3fa(lo_x, lo_y, lo_z);
+  }
+  /*! Component-wise maximum via sycl::fmax. */
+  __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) {
+    const float hi_x = sycl::fmax(a.x,b.x);
+    const float hi_y = sycl::fmax(a.y,b.y);
+    const float hi_z = sycl::fmax(a.z,b.z);
+    return Vec3fa(hi_x, hi_y, hi_z);
+  }
+
+/*
+#if defined(__SSE4_1__)
+    __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+#if defined(__SSE4_1__)
+    __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+*/
+  /*! Raises x, y and z to the power b with powf. */
+  __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) {
+    const float px = powf(a.x,b);
+    const float py = powf(a.y,b);
+    const float pz = powf(a.z,b);
+    return Vec3fa(px,py,pz);
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Fused-style ternary operations. The vector overloads apply the scalar
+   *  madd/msub/nmadd/nmsub helpers to x, y and z independently; the overloads
+   *  taking a float first broadcast it through Vec3fa(float). */
+  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return Vec3fa(madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); }
+  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return Vec3fa(msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); }
+  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return Vec3fa(nmadd(a.x,b.x,c.x), nmadd(a.y,b.y,c.y), nmadd(a.z,b.z,c.z)); }
+  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return Vec3fa(nmsub(a.x,b.x,c.x), nmsub(a.y,b.y,c.y), nmsub(a.z,b.z,c.z)); }
+
+  __forceinline Vec3fa madd  ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
+  __forceinline Vec3fa msub  ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); }
+  __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); }
+  __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Compound assignments: each evaluates the matching binary operator, stores
+   *  the result in a and returns a. */
+  __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { a = a + b; return a; }
+  __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { a = a - b; return a; }
+  __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { a = a * b; return a; }
+  __forceinline Vec3fa& operator *=( Vec3fa& a, const float   b ) { a = a * b; return a; }
+  __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { a = a / b; return a; }
+  __forceinline Vec3fa& operator /=( Vec3fa& a, const float   b ) { a = a / b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Scalar reductions over x, y and z, folded left to right. */
+  __forceinline float reduce_add(const Vec3fa& v) { const float xy = v.x+v.y; return xy+v.z; }
+  __forceinline float reduce_mul(const Vec3fa& v) { const float xy = v.x*v.y; return xy*v.z; }
+  __forceinline float reduce_min(const Vec3fa& v) { const float xy = sycl::fmin(v.x,v.y); return sycl::fmin(xy,v.z); }
+  __forceinline float reduce_max(const Vec3fa& v) { const float xy = sycl::fmax(v.x,v.y); return sycl::fmax(xy,v.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! true when x, y and z all compare equal */
+  __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) {
+    if (!(a.x == b.x)) return false;
+    if (!(a.y == b.y)) return false;
+    return a.z == b.z;
+  }
+  /*! true when at least one of x, y, z compares not-equal */
+  __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) {
+    if (a.x != b.x) return true;
+    if (a.y != b.y) return true;
+    return a.z != b.z;
+  }
+
+  /*! Component-wise comparison masks: each returns a Vec3ba built from the three
+   *  scalar comparison results for x, y and z. */
+  __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(a.x == b.x, a.y == b.y, a.z == b.z); }
+  __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(a.x != b.x, a.y != b.y, a.z != b.z); }
+  __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(a.x <  b.x, a.y <  b.y, a.z <  b.z); }
+  __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(a.x <= b.x, a.y <= b.y, a.z <= b.z); }
+  __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(a.x >  b.x, a.y >  b.y, a.z >  b.z); }
+  __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(a.x >= b.x, a.y >= b.y, a.z >= b.z); }
+
+  /*! True when x, y and z are all strictly greater than -FLT_LARGE and strictly
+   *  less than +FLT_LARGE. */
+  __forceinline bool isvalid ( const Vec3fa& v ) {
+    const Vec3ba above_lower = gt_mask(v,Vec3fa(-FLT_LARGE));
+    const Vec3ba below_upper = lt_mask(v,Vec3fa(+FLT_LARGE));
+    return all(above_lower & below_upper);
+  }
+
+  /*! True when x, y and z all lie in the closed range [-FLT_MAX, +FLT_MAX]. */
+  __forceinline bool is_finite ( const Vec3fa& a ) {
+    const Vec3ba not_below = ge_mask(a,Vec3fa(-FLT_MAX));
+    const Vec3ba not_above = le_mask(a,Vec3fa(+FLT_MAX));
+    return all(not_below & not_above);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Dot product: sum of the component-wise products of x, y and z. */
+  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
+    const Vec3fa products = a*b;
+    return reduce_add(products);
+  }
+
+  /*! Cross product of a and b; each component is one scalar msub of two products. */
+  __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b ) {
+    const float cx = msub(a.y,b.z,a.z*b.y);
+    const float cy = msub(a.z,b.x,a.x*b.z);
+    const float cz = msub(a.x,b.y,a.y*b.x);
+    return Vec3fa(cx, cy, cz);
+  }
+  
+  /*! Length-related helpers, all derived from dot(a,a). */
+  __forceinline float  sqr_length ( const Vec3fa& a )                { const float len2 = dot(a,a); return len2; }
+  __forceinline float  rcp_length ( const Vec3fa& a )                { const float len2 = dot(a,a); return rsqrt(len2); }
+  __forceinline float  rcp_length2( const Vec3fa& a )                { const float len2 = dot(a,a); return rcp(len2); }
+  __forceinline float  length   ( const Vec3fa& a )                  { const float len2 = dot(a,a); return sqrt(len2); }
+  /*! scales a by rsqrt(dot(a,a)); a zero vector is not special-cased here */
+  __forceinline Vec3fa normalize( const Vec3fa& a )                  { const float len2 = dot(a,a); return a*rsqrt(len2); }
+  /*! length of the difference a-b */
+  __forceinline float  distance ( const Vec3fa& a, const Vec3fa& b ) { const Vec3fa delta = a-b; return length(delta); }
+  /*! x*(y+z) + y*z evaluated through madd */
+  __forceinline float  halfArea ( const Vec3fa& d )                  { const float yz_sum = d.y+d.z; return madd(d.x,yz_sum,d.y*d.z); }
+  /*! twice halfArea(d) */
+  __forceinline float  area     ( const Vec3fa& d )                  { const float half = halfArea(d); return 2.0f*half; }
+
+  /*! Like normalize, but returns a unchanged when dot(a,a) is exactly zero. */
+  __forceinline Vec3fa normalize_safe( const Vec3fa& a ) {
+    const float len2 = dot(a,a);
+    if (unlikely(len2 == 0.0f))
+      return a;
+    return a*rsqrt(len2);
+  }
+
+  /*! Differentiated normalization: given p and its derivative dp, evaluates
+   *  (dot(p,p)*dp - dot(p,dp)*p) * rcp(dot(p,p)) * rsqrt(dot(p,p)). */
+  __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)
+  {
+    const float  p_sqr    = dot(p,p);
+    const float  p_dot_dp = dot(p,dp);
+    const Vec3fa numer    = p_sqr*dp-p_dot_dp*p;
+    return numer*rcp(p_sqr)*rsqrt(p_sqr);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Whole-vector select: takes x, y and z from t when s is true, from f otherwise. */
+  __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {
+    const float x = s ? t.x : f.x;
+    const float y = s ? t.y : f.y;
+    const float z = s ? t.z : f.z;
+    return Vec3fa(x, y, z);
+  }
+
+  /*! Component-wise select: each of x, y, z comes from t where the matching mask
+   *  component of s is set, from f otherwise. */
+  __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {
+    const float x = s.x ? t.x : f.x;
+    const float y = s.y ? t.y : f.y;
+    const float z = s.z ? t.z : f.z;
+    return Vec3fa(x, y, z);
+  }
+  
+  /*! Linear interpolation evaluated as madd(1-t, v0, t*v1). */
+  __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) {
+    const float one_minus_t = 1.0f-t;
+    return madd(one_minus_t,v0,t*v1);
+  }
+
+  /*! Index (0, 1 or 2) of the component with the largest absolute value.
+   *  All comparisons are strict, so ties resolve toward the higher index. */
+  __forceinline int maxDim ( const Vec3fa& a )
+  {
+    const Vec3fa mag = abs(a);
+    if (mag.x > mag.y)
+      return (mag.x > mag.z) ? 0 : 2;
+    return (mag.y > mag.z) ? 1 : 2;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Component-wise rounding of x, y and z through sycl::trunc, sycl::floor and sycl::ceil. */
+  __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(sycl::trunc(a.x),sycl::trunc(a.y),sycl::trunc(a.z)); }
+  __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(sycl::floor(a.x),sycl::floor(a.y),sycl::floor(a.z)); }
+  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(sycl::ceil (a.x),sycl::ceil (a.y),sycl::ceil (a.z)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Streams the vector as "(x, y, z)". */
+  inline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a)
+  {
+    return cout << "("
+                << a.x << ", "
+                << a.y << ", "
+                << a.z << ")";
+  }
+
+  /*! Out-of-class definition of the Vec2fa constructor taking a Vec3fa: copies x and y, dropping z. */
+  __forceinline Vec2fa::Vec2fa(const Vec3fa& a)
+    : x(a.x), y(a.y) {}
+
+  /*! Out-of-class definition of the Vec3ia constructor taking a Vec3fa: converts each component with an int cast. */
+  __forceinline Vec3ia::Vec3ia( const Vec3fa& a )
+    : x((int)a.x), y((int)a.y), z((int)a.z) {}
+
+  /*! Vec3fa_t is an alias of Vec3fa. */
+  using Vec3fa_t = Vec3fa;
+
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SYCL Vec3fx Type (scalar implementation)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Three-component float vector with a usable fourth slot, for the SYCL build.
+   *  Stored as plain scalar members and aligned to 16 bytes. The fourth slot is
+   *  a union and can be accessed as int (a), unsigned (u) or float (w). */
+  struct __aligned(16) Vec3fx
+  {
+    //ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 3 };
+    struct { float x,y,z; union { int a; unsigned u; float w; }; };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! leaves all members uninitialized */
+    __forceinline Vec3fx( ) {}
+    //__forceinline Vec3fx( const __m128 a ) : m128(a) {}
+    /*! takes all four lanes from a vfloat4 */
+    __forceinline explicit Vec3fx(const vfloat4& a) : x(a[0]), y(a[1]), z(a[2]), w(a[3]) {}
+
+    /*! conversion from Vec3fa sets w to 0; conversion back drops the fourth slot */
+    __forceinline explicit Vec3fx(const Vec3fa& v) : x(v.x), y(v.y), z(v.z), w(0.0f) {}
+    __forceinline operator Vec3fa() const { return Vec3fa(x,y,z); }
+    
+    /*! conversion from the generic Vec3<float>; w is set to 0 so that the
+     *  four-lane operators, storeu and isvalid4 never read an uninitialized value */
+    __forceinline explicit Vec3fx ( const Vec3<float>& other  ) : x(other.x), y(other.y), z(other.z), w(0.0f) {}
+    //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { x = other.x; y = other.y; z = other.z; return *this; }
+
+    //__forceinline Vec3fx            ( const Vec3fx& other ) { *(sycl::float4*)this = *(const sycl::float4*)&other; }
+    //__forceinline Vec3fx& operator =( const Vec3fx& other ) { *(sycl::float4*)this = *(const sycl::float4*)&other; return *this; }
+
+    /*! broadcast constructor: all four slots receive a */
+    __forceinline explicit Vec3fx( const float a ) : x(a), y(a), z(a), w(a) {}
+    /*! three-component constructor: w receives a copy of z */
+    __forceinline          Vec3fx( const float x, const float y, const float z) : x(x), y(y), z(z), w(z) {}
+
+    /*! x, y, z from a Vec3fa plus an explicit fourth slot of the given type */
+    __forceinline Vec3fx( const Vec3fa& other, const int      a1) : x(other.x), y(other.y), z(other.z), a(a1) {}
+    __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) : x(other.x), y(other.y), z(other.z), u(a1) {}
+    __forceinline Vec3fx( const Vec3fa& other, const float    w1) : x(other.x), y(other.y), z(other.z), w(w1) {}
+
+    //__forceinline Vec3fx( const float x, const float y, const float z, const int      a) : x(x), y(y), z(z), a(a) {} // not working properly!
+    //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
+    __forceinline Vec3fx( const float x, const float y, const float z, const float w) : x(x), y(y), z(z), w(w) {}
+
+    /*! converts each integer component with a float cast; w is set to 0 */
+    __forceinline explicit Vec3fx( const Vec3ia& a ) : x((float)a.x), y((float)a.y), z((float)a.z), w(0.0f) {}
+
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+    /*! widens to a vfloat4 carrying x, y, z and w */
+    __forceinline operator vfloat4() const { return vfloat4(x,y,z,w); }
+
+    //friend __forceinline Vec3fx copy_a( const Vec3fx& a, const Vec3fx& b ) { Vec3fx c = a; c.a = b.a; return c; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! reads four consecutive floats starting at a */
+    static __forceinline Vec3fx load( const void* const a ) {
+      const float* ptr = (const float*)a;
+      return Vec3fx(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+
+    /*! same four-float read as load */
+    static __forceinline Vec3fx loadu( const void* const a ) {
+      const float* ptr = (const float*)a;
+      return Vec3fx(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+
+    /*! writes x, y, z and w to four consecutive floats starting at a */
+    static __forceinline void storeu ( void* a, const Vec3fx& v ) {
+      float* ptr = (float*)a;
+      ptr[0] = v.x; ptr[1] = v.y; ptr[2] = v.z; ptr[3] = v.w;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fx( ZeroTy   ) : x(0.0f), y(0.0f), z(0.0f), w(0.0f) {}
+    __forceinline Vec3fx( OneTy    ) : x(1.0f), y(1.0f), z(1.0f), w(1.0f) {}
+    __forceinline Vec3fx( PosInfTy ) : x(+INFINITY), y(+INFINITY), z(+INFINITY), w(+INFINITY) {}
+    __forceinline Vec3fx( NegInfTy ) : x(-INFINITY), y(-INFINITY), z(-INFINITY), w(-INFINITY) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! component access by index; asserts index < 3, so w is not reachable this way */
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Unary plus, negation and absolute value; negation and abs operate on all four components, including w. */
+  __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; }
+  __forceinline Vec3fx operator -( const Vec3fx& a ) { return Vec3fx(-a.x,-a.y,-a.z,-a.w); }
+  __forceinline Vec3fx abs  ( const Vec3fx& a ) { return Vec3fx(sycl::fabs(a.x),sycl::fabs(a.y),sycl::fabs(a.z),sycl::fabs(a.w)); }
+  /*! Component-wise sycl::sign over all four components; the fourth result is taken from a.w. */
+  __forceinline Vec3fx sign ( const Vec3fx& a ) { return Vec3fx(sycl::sign(a.x),sycl::sign(a.y),sycl::sign(a.z),sycl::sign(a.w)); }
+
+  /*! Component-wise reciprocal, square root, square and reciprocal square root over
+   *  all four components. rcp uses sycl::native::recip (the sycl::recip variant is
+   *  kept commented out). */
+  //__forceinline Vec3fx rcp  ( const Vec3fx& a ) { return Vec3fx(sycl::recip(a.x),sycl::recip(a.y),sycl::recip(a.z)); }
+  __forceinline Vec3fx rcp  ( const Vec3fx& a ) { return Vec3fx(sycl::native::recip(a.x),sycl::native::recip(a.y),sycl::native::recip(a.z),sycl::native::recip(a.w)); }
+  __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return Vec3fx(sycl::sqrt(a.x),sycl::sqrt(a.y),sycl::sqrt(a.z),sycl::sqrt(a.w)); }
+  __forceinline Vec3fx sqr  ( const Vec3fx& a ) { return Vec3fx(a.x*a.x,a.y*a.y,a.z*a.z,a.w*a.w); }
+
+  __forceinline Vec3fx rsqrt( const Vec3fx& a ) { return Vec3fx(sycl::rsqrt(a.x),sycl::rsqrt(a.y),sycl::rsqrt(a.z),sycl::rsqrt(a.w)); }
+
+  /*! Replaces each of x, y, z whose magnitude is below min_rcp_input with
+   *  +min_rcp_input (the original sign is not kept). The result is built with the
+   *  three-argument constructor; a.w is not read. */
+  __forceinline Vec3fx zero_fix(const Vec3fx& a) {
+    float x = a.x;
+    float y = a.y;
+    float z = a.z;
+    if (sycl::fabs(x) < min_rcp_input) x = min_rcp_input;
+    if (sycl::fabs(y) < min_rcp_input) y = min_rcp_input;
+    if (sycl::fabs(z) < min_rcp_input) z = min_rcp_input;
+    return Vec3fx(x,y,z);
+  }
+  /*! Reciprocal of a after zero_fix has replaced near-zero components. */
+  __forceinline Vec3fx rcp_safe(const Vec3fx& a) {
+    const Vec3fx fixed = zero_fix(a);
+    return rcp(fixed);
+  }
+  /*! Component-wise sycl::log of x, y and z; a.w is not read. */
+  __forceinline Vec3fx log ( const Vec3fx& a ) {
+    const float lx = sycl::log(a.x);
+    const float ly = sycl::log(a.y);
+    const float lz = sycl::log(a.z);
+    return Vec3fx(lx,ly,lz);
+  }
+
+  /*! Component-wise sycl::exp of x, y and z; a.w is not read. */
+  __forceinline Vec3fx exp ( const Vec3fx& a ) {
+    const float ex = sycl::exp(a.x);
+    const float ey = sycl::exp(a.y);
+    const float ez = sycl::exp(a.z);
+    return Vec3fx(ex,ey,ez);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Component-wise arithmetic over all four components, including w. The scalar
+   *  multiplications broadcast the float through Vec3fx(float) and reuse the vector
+   *  product; the scalar divisions divide each component directly. */
+  __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); }
+  __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); }
+  __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); }
+  __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); }
+  __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; }
+  __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); }
+  __forceinline Vec3fx operator /( const Vec3fx& a, const float b        ) { return Vec3fx(a.x/b, a.y/b, a.z/b, a.w/b); }
+  __forceinline Vec3fx operator /( const        float a, const Vec3fx& b ) { return Vec3fx(a/b.x, a/b.y, a/b.z, a/b.w); }
+
+  /*! Component-wise minimum via sycl::fmin over all four components. */
+  __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) {
+    const float lo_x = sycl::fmin(a.x,b.x);
+    const float lo_y = sycl::fmin(a.y,b.y);
+    const float lo_z = sycl::fmin(a.z,b.z);
+    const float lo_w = sycl::fmin(a.w,b.w);
+    return Vec3fx(lo_x, lo_y, lo_z, lo_w);
+  }
+  /*! Component-wise maximum via sycl::fmax over all four components. */
+  __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) {
+    const float hi_x = sycl::fmax(a.x,b.x);
+    const float hi_y = sycl::fmax(a.y,b.y);
+    const float hi_z = sycl::fmax(a.z,b.z);
+    const float hi_w = sycl::fmax(a.w,b.w);
+    return Vec3fx(hi_x, hi_y, hi_z, hi_w);
+  }
+
+/*
+#if defined(__SSE4_1__)
+    __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+#if defined(__SSE4_1__)
+    __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+    __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) {
+      return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b));
+    }
+*/
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Fused-style ternary operations. The vector overloads apply the scalar
+   *  madd/msub/nmadd/nmsub helpers to all four components independently; the
+   *  overloads taking a float first broadcast it through Vec3fx(float). */
+  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return Vec3fx(madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); }
+  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return Vec3fx(msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); }
+  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return Vec3fx(nmadd(a.x,b.x,c.x), nmadd(a.y,b.y,c.y), nmadd(a.z,b.z,c.z), nmadd(a.w,b.w,c.w)); }
+  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return Vec3fx(nmsub(a.x,b.x,c.x), nmsub(a.y,b.y,c.y), nmsub(a.z,b.z,c.z), nmsub(a.w,b.w,c.w)); }
+
+  __forceinline Vec3fx madd  ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); }
+  __forceinline Vec3fx msub  ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); }
+  __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); }
+  __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Compound assignments: each evaluates the matching binary operator, stores
+   *  the result in a and returns a. */
+  __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { a = a + b; return a; }
+  __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { a = a - b; return a; }
+  __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { a = a * b; return a; }
+  __forceinline Vec3fx& operator *=( Vec3fx& a, const float   b ) { a = a * b; return a; }
+  __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { a = a / b; return a; }
+  __forceinline Vec3fx& operator /=( Vec3fx& a, const float   b ) { a = a / b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Scalar reductions over x, y and z, folded left to right; w is not included. */
+  __forceinline float reduce_add(const Vec3fx& v) { const float xy = v.x+v.y; return xy+v.z; }
+  __forceinline float reduce_mul(const Vec3fx& v) { const float xy = v.x*v.y; return xy*v.z; }
+  __forceinline float reduce_min(const Vec3fx& v) { const float xy = sycl::fmin(v.x,v.y); return sycl::fmin(xy,v.z); }
+  __forceinline float reduce_max(const Vec3fx& v) { const float xy = sycl::fmax(v.x,v.y); return sycl::fmax(xy,v.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! true when x, y and z all compare equal; w is not compared */
+  __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) {
+    if (!(a.x == b.x)) return false;
+    if (!(a.y == b.y)) return false;
+    return a.z == b.z;
+  }
+  /*! true when at least one of x, y, z compares not-equal; w is not compared */
+  __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) {
+    if (a.x != b.x) return true;
+    if (a.y != b.y) return true;
+    return a.z != b.z;
+  }
+
+  /*! Component-wise comparison masks: each returns a Vec3ba built from the three
+   *  scalar comparison results for x, y and z; w does not take part. */
+  __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(a.x == b.x, a.y == b.y, a.z == b.z); }
+  __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(a.x != b.x, a.y != b.y, a.z != b.z); }
+  __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(a.x <  b.x, a.y <  b.y, a.z <  b.z); }
+  __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(a.x <= b.x, a.y <= b.y, a.z <= b.z); }
+  __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(a.x >  b.x, a.y >  b.y, a.z >  b.z); }
+  __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(a.x >= b.x, a.y >= b.y, a.z >= b.z); }
+
+  __forceinline bool isvalid ( const Vec3fx& v ) {
+    const Vec3fx lo(-FLT_LARGE), hi(+FLT_LARGE); return all(gt_mask(v,lo) & lt_mask(v,hi)); /* x, y, z strictly between -FLT_LARGE and +FLT_LARGE */
+  }
+
+  __forceinline bool is_finite ( const Vec3fx& a ) {
+    const Vec3fx lo(-FLT_MAX), hi(+FLT_MAX); return all(ge_mask(a,lo) & le_mask(a,hi)); /* x, y, z inside [-FLT_MAX, +FLT_MAX] */
+  }
+
+  __forceinline bool isvalid4 ( const Vec3fx& v ) {
+    /* range test for one component; both comparisons are always evaluated (no short-circuit) */
+    auto in_range = [](const float c) -> bool { return c >= -FLT_LARGE & c <= +FLT_LARGE; };
+    const bool xy_ok = in_range(v.x) & in_range(v.y);
+    const bool zw_ok = in_range(v.z) & in_range(v.w);
+    return xy_ok & zw_ok;
+  }
+
+  __forceinline bool is_finite4 ( const Vec3fx& v ) {
+    /* range test for one component against [-FLT_MAX, +FLT_MAX]; evaluated without short-circuit */
+    auto is_fin = [](const float c) -> bool { return c >= -FLT_MAX & c <= +FLT_MAX; };
+    const bool xy_fin = is_fin(v.x) & is_fin(v.y);
+    const bool zw_fin = is_fin(v.z) & is_fin(v.w);
+    return xy_fin & zw_fin;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
+    const Vec3fx prod = a*b; return reduce_add(prod); /* reduce_add sums the x, y and z of the product */
+  }
+
+  __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b ) {
+    const auto cx = msub(a.y,b.z,a.z*b.y), cy = msub(a.z,b.x,a.x*b.z), cz = msub(a.x,b.y,a.y*b.x); return Vec3fx(cx,cy,cz); /* one msub per output component */
+  }
+  
+  __forceinline float  sqr_length ( const Vec3fx& a )                { const float len2 = dot(a,a); return len2; } // dot of a with itself
+  __forceinline float  rcp_length ( const Vec3fx& a )                { const float len2 = dot(a,a); return rsqrt(len2); } // rsqrt of the squared length
+  __forceinline float  rcp_length2( const Vec3fx& a )                { const float len2 = dot(a,a); return rcp(len2); } // rcp of the squared length
+  __forceinline float  length   ( const Vec3fx& a )                  { const float len2 = dot(a,a); return sqrt(len2); } // sqrt of the squared length
+  __forceinline Vec3fx normalize( const Vec3fx& a )                  { const float inv_len = rsqrt(dot(a,a)); return a*inv_len; } // no guard for a zero-length input
+  __forceinline float  distance ( const Vec3fx& a, const Vec3fx& b ) { const Vec3fx delta = a-b; return length(delta); } // length of the difference vector
+  __forceinline float  halfArea ( const Vec3fx& d )                  { const float yz_sum = d.y+d.z; return madd(d.x,yz_sum,d.y*d.z); } // madd(x, y+z, y*z)
+  __forceinline float  area     ( const Vec3fx& d )                  { const float h = halfArea(d); return 2.0f*h; } // twice halfArea
+
+  __forceinline Vec3fx normalize_safe( const Vec3fx& a ) {
+    const float len2 = dot(a,a); return unlikely(len2 == 0.0f) ? a : a*rsqrt(len2); /* a zero squared length returns the input unchanged */
+  }
+
+  /*! differentiated normalization: (dot(p,p)*dp - dot(p,dp)*p) scaled by rcp(dot(p,p))*rsqrt(dot(p,p)) */
+  __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp)
+  {
+    const float len2 = dot(p,p), proj = dot(p,dp);
+    const Vec3fx numer = len2*dp-proj*p;
+    return numer*rcp(len2)*rsqrt(len2);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) {
+    if (s) return Vec3fx(t.x, t.y, t.z, t.w); else return Vec3fx(f.x, f.y, f.z, f.w); /* one flag picks all four components, w included */
+  }
+
+  __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) {
+    const float x = s.x ? t.x : f.x, y = s.y ? t.y : f.y, z = s.z ? t.z : f.z; return Vec3fx(x,y,z); /* per-component pick of x, y, z; w is not passed on */
+  }
+  
+  __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) {
+    const float s = 1.0f-t; return madd(s,v0,t*v1); /* madd(1-t, v0, t*v1) */
+  }
+
+  __forceinline int maxDim ( const Vec3fx& a )
+  {
+    const Vec3fx mag = abs(a);
+    /* returns 0, 1 or 2 for x, y or z; comparisons are strict, so z wins a tie with the x/y winner */
+    if (mag.x > mag.y)
+      return (mag.x > mag.z) ? 0 : 2;
+    /* x did not beat y (ties included), so y is compared against z */
+    return (mag.y > mag.z) ? 1 : 2;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(sycl::trunc(a.x),sycl::trunc(a.y),sycl::trunc(a.z),sycl::trunc(a.w)); } // sycl::trunc applied to x, y, z and w
+  __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(sycl::floor(a.x),sycl::floor(a.y),sycl::floor(a.z),sycl::floor(a.w)); } // sycl::floor applied to x, y, z and w
+  __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(sycl::ceil (a.x),sycl::ceil (a.y),sycl::ceil (a.z),sycl::ceil (a.w)); } // sycl::ceil applied to x, y, z and w
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  inline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) { // writes all four components, w included
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << "," << a.w << ")"; // NOTE(review): the separator before w is "," with no space, unlike the other separators
+  }
+
+  typedef Vec3fx Vec3ff; // Vec3ff is a second name for the same Vec3fx type
+
+  //__forceinline Vec2fa::Vec2fa(const Vec3fx& a)
+  //  : x(a.x), y(a.y) {}
+
+  //__forceinline Vec3ia::Vec3ia( const Vec3fx& a )
+  //  : x((int)a.x), y((int)a.y), z((int)a.z) {}
+
+}
+
+#if __SYCL_COMPILER_VERSION >= 20210801 // specializations are only compiled for compiler versions at or above this value
+namespace sycl {
+  template<> struct is_device_copyable<embree::Vec3fa> : std::true_type {}; // declares embree::Vec3fa device copyable
+  template<> struct is_device_copyable<const embree::Vec3fa> : std::true_type {}; // same for the const-qualified type; NOTE(review): no Vec3fx specialization is visible here - confirm whether one is needed
+}
+#endif
--- a/thirdparty/embree/common/math/vec3ia.h
+++ b/thirdparty/embree/common/math/vec3ia.h
@@ -0,0 +1,203 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "emath.h"
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#  include "vec3ia_sycl.h"
+#else
+
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3ia Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3ia // three named int components overlaid on one __m128i
+  {
+    ALIGNED_STRUCT_(16);
+
+    union {
+      __m128i m128; // whole-register view used by the intrinsic-based operators
+      struct { int x,y,z; }; // named view of the first three ints; the remaining 32 bits of m128 have no named member
+    };
+
+    typedef int Scalar;
+    enum { N = 3 }; // number of named components
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ia( ) {} // leaves the storage uninitialized
+    __forceinline Vec3ia( const __m128i a ) : m128(a) {} // implicit: lets intrinsic results be returned as Vec3ia
+    __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {}
+    __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {} // same value in every 32-bit lane
+    __forceinline          Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {} // _mm_set_epi32 lists lanes from highest to lowest, so the unnamed top lane receives a copy of z
+    __forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {} // float-to-int conversion of each lane via _mm_cvtps_epi32
+
+    __forceinline operator const __m128i&() const { return m128; }
+    __forceinline operator       __m128i&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ia( ZeroTy   ) : m128(_mm_setzero_si128()) {}
+    __forceinline Vec3ia( OneTy    ) : m128(_mm_set1_epi32(1)) {}
+    __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {} // pos_inf is defined outside this chunk; presumably it converts to the largest int - confirm
+    __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {} // neg_inf is defined outside this chunk; presumably it converts to the smallest int - confirm
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } // indexes from the address of x; only 0..2 pass the assert
+    __forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; } // mutable variant of the same access
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; } // unary plus: returns a copy
+  __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } // negation computed as 0 - a in every 32-bit lane
+#if (defined(__aarch64__))
+  __forceinline Vec3ia abs       ( const Vec3ia& a ) { return vabsq_s32(a.m128); } // aarch64 build: NEON vabsq_s32
+#elif defined(__SSSE3__)
+  __forceinline Vec3ia abs       ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); } // SSSE3 build: _mm_abs_epi32; when neither macro is defined no abs(Vec3ia) is declared here
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { const __m128i sum = _mm_add_epi32(a.m128, b.m128); return sum; } // 32-bit add in every lane
+  __forceinline Vec3ia operator +( const Vec3ia& a, const int     b ) { const Vec3ia rhs(b); return a+rhs; } // scalar is first expanded with Vec3ia(int)
+  __forceinline Vec3ia operator +( const int     a, const Vec3ia& b ) { const Vec3ia lhs(a); return lhs+b; }
+
+  __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { const __m128i diff = _mm_sub_epi32(a.m128, b.m128); return diff; } // 32-bit subtract in every lane
+  __forceinline Vec3ia operator -( const Vec3ia& a, const int     b ) { const Vec3ia rhs(b); return a-rhs; }
+  __forceinline Vec3ia operator -( const int     a, const Vec3ia& b ) { const Vec3ia lhs(a); return lhs-b; }
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+  __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { const __m128i prod = _mm_mullo_epi32(a.m128, b.m128); return prod; } // multiplication is only declared under this guard
+  __forceinline Vec3ia operator *( const Vec3ia& a, const int     b ) { const Vec3ia rhs(b); return a * rhs; }
+  __forceinline Vec3ia operator *( const int     a, const Vec3ia& b ) { const Vec3ia lhs(a); return lhs * b; }
+#endif
+
+  __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { const __m128i bits = _mm_and_si128(a.m128, b.m128); return bits; } // bitwise AND of the whole register
+  __forceinline Vec3ia operator &( const Vec3ia& a, const int     b ) { const Vec3ia rhs(b); return a & rhs; } // scalar is first expanded with Vec3ia(int)
+  __forceinline Vec3ia operator &( const int     a, const Vec3ia& b ) { const Vec3ia lhs(a); return lhs & b; }
+
+  __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { const __m128i bits = _mm_or_si128(a.m128, b.m128); return bits; } // bitwise OR of the whole register
+  __forceinline Vec3ia operator |( const Vec3ia& a, const int     b ) { const Vec3ia rhs(b); return a | rhs; }
+  __forceinline Vec3ia operator |( const int     a, const Vec3ia& b ) { const Vec3ia lhs(a); return lhs | b; }
+
+  __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { const __m128i bits = _mm_xor_si128(a.m128, b.m128); return bits; } // bitwise XOR of the whole register
+  __forceinline Vec3ia operator ^( const Vec3ia& a, const int     b ) { const Vec3ia rhs(b); return a ^ rhs; }
+  __forceinline Vec3ia operator ^( const int     a, const Vec3ia& b ) { const Vec3ia lhs(a); return lhs ^ b; }
+
+  __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); } // left shift of every 32-bit lane by n
+  __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); } // arithmetic (sign-preserving) right shift, same intrinsic as sra
+
+  __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); } // shift left logical
+  __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); } // shift right arithmetic: sign bit is replicated
+  __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); } // shift right logical: zeros are shifted in
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { a = a + b; return a; } // every compound operator stores the binary result in a and returns a
+  __forceinline Vec3ia& operator +=( Vec3ia& a, const int&   b ) { a = a + b; return a; }
+  
+  __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { a = a - b; return a; }
+  __forceinline Vec3ia& operator -=( Vec3ia& a, const int&   b ) { a = a - b; return a; }
+  
+#if defined(__aarch64__) || defined(__SSE4_1__)
+  __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { a = a * b; return a; } // same guard as the binary operator *
+  __forceinline Vec3ia& operator *=( Vec3ia& a, const int&    b ) { a = a * b; return a; }
+#endif
+  
+  __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { a = a & b; return a; }
+  __forceinline Vec3ia& operator &=( Vec3ia& a, const int&    b ) { a = a & b; return a; }
+  
+  __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { a = a | b; return a; }
+  __forceinline Vec3ia& operator |=( Vec3ia& a, const int&    b ) { a = a | b; return a; }
+  
+#if !defined(__ARM_NEON)
+  __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { a = a << b; return a; } // shift-assign is not declared when __ARM_NEON is defined
+  __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { a = a >> b; return a; }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) { // lane-wise choice: t where the mask is set, f elsewhere
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); // blendv takes t in lanes whose mask sign bit is set; casts only reinterpret the bits
+#else
+    return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); // fallback: (m & t) | (~m & f); assumes mask lanes are all-ones or all-zeros - TODO confirm against Vec3ba
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__) // NEON horizontal reductions run over all four lanes, so the vector is passed through select with a fill value first
+  __forceinline int reduce_add(const Vec3ia& v) { return vaddvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0))); } // fill 0; presumably Vec3ba(1,1,1) leaves the fourth lane unset so it takes the fill - confirm
+  __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } // scalar product of the three named components
+  __forceinline int reduce_min(const Vec3ia& v) { return vminvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x7FFFFFFF))); } // fill 0x7FFFFFFF cannot be smaller than any lane
+  __forceinline int reduce_max(const Vec3ia& v) { return vmaxvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x80000000))); } // fill 0x80000000 is converted to int by the Vec3ia(int) constructor
+#else
+  __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; } // scalar sum of x, y, z
+  __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } // scalar product of x, y, z
+  __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); } // three-argument min defined outside this chunk
+  __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); } // three-argument max defined outside this chunk
+#endif
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) == 7; } // & 7 keeps the three low movemask bits (x, y, z); all three must be set
+  __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) != 7; } // true when at least one of x, y, z differs; the fourth lane is masked out
+  __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) {
+    /* lexicographic comparison: the first differing component (x, then y, then z) decides */
+    const bool x_differs = a.x != b.x;
+    const bool y_differs = a.y != b.y;
+    return x_differs ? a.x < b.x : y_differs ? a.y < b.y : a.z < b.z;
+  }
+
+  __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); } // lane-wise a == b; the integer compare result is reinterpreted as __m128 for Vec3ba
+  __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); } // lane-wise signed a < b
+  __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); } // lane-wise signed a > b
+
+#if defined(__aarch64__) || defined(__SSE4_1__) // direct min/max intrinsics are used under this guard
+  __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); } // lane-wise signed minimum
+  __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); } // lane-wise signed maximum
+#else
+  __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return select(lt_mask(a,b),a,b); } // fallback: take a where a < b, otherwise b
+  __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return select(gt_mask(a,b),a,b); } // fallback: take a where a > b, otherwise b
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) { // writes the three named components as "(x, y, z)"
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; // the unnamed fourth lane is not printed
+  }
+}
+
+#endif
--- a/thirdparty/embree/common/math/vec3ia_sycl.h
+++ b/thirdparty/embree/common/math/vec3ia_sycl.h
@@ -0,0 +1,178 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "emath.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SYCL Vec3ia Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3ia // three plain int components; no __m128i member in this variant
+  {
+    ALIGNED_STRUCT_(16);
+
+    struct { int x,y,z; }; // the only data members
+
+    typedef int Scalar;
+    enum { N = 3 }; // number of components
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ia( ) {} // leaves x, y, z uninitialized
+    //__forceinline Vec3ia( const __m128i a ) : m128(a) {}
+
+    __forceinline Vec3ia( const Vec3ia& other ) : x(other.x), y(other.y), z(other.z) {}
+    __forceinline Vec3ia& operator =(const Vec3ia& other) { x = other.x; y = other.y; z = other.z; return *this; }
+
+    __forceinline explicit Vec3ia( const int a ) : x(a), y(a), z(a) {} // same value in all three components
+    __forceinline          Vec3ia( const int x, const int y, const int z) : x(x), y(y), z(z) {}
+    //__forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {}
+    __forceinline explicit Vec3ia(const vint4& a) : x(a[0]), y(a[1]), z(a[2]) {} // takes the first three elements of a; a[3] is not read
+
+    __forceinline explicit Vec3ia( const Vec3fa& a ); // declaration only; the definition is not visible in this chunk
+
+    //__forceinline operator const __m128i&() const { return m128; }
+    //__forceinline operator       __m128i&()       { return m128; }
+    __forceinline operator vint4() const { return vint4(x,y,z,z); } // the fourth vint4 element is a copy of z
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ia( ZeroTy   ) : x(0), y(0), z(0) {}
+    __forceinline Vec3ia( OneTy    ) : x(1), y(1), z(1) {}
+    __forceinline Vec3ia( PosInfTy ) : x(0x7FFFFFFF), y(0x7FFFFFFF), z(0x7FFFFFFF) {} // 0x7FFFFFFF: largest value of a 32-bit signed int
+    __forceinline Vec3ia( NegInfTy ) : x(0x80000000), y(0x80000000), z(0x80000000) {} // 0x80000000 is converted to int; with a 32-bit two's-complement int that is the most negative value
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } // indexes from the address of x; only 0..2 pass the assert
+    __forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; } // mutable variant of the same access
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia operator +( const Vec3ia& a ) { const int x = +a.x, y = +a.y, z = +a.z; return Vec3ia(x,y,z); } // unary plus: component-wise copy
+  __forceinline Vec3ia operator -( const Vec3ia& a ) { const int x = -a.x, y = -a.y, z = -a.z; return Vec3ia(x,y,z); } // component-wise negation
+  __forceinline Vec3ia abs       ( const Vec3ia& a ) { const auto x = sycl::abs(a.x), y = sycl::abs(a.y), z = sycl::abs(a.z); return Vec3ia(x,y,z); } // sycl::abs on each component
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { const int x = a.x+b.x, y = a.y+b.y, z = a.z+b.z; return Vec3ia(x,y,z); } // component-wise sum
+  __forceinline Vec3ia operator +( const Vec3ia& a, const int     b ) { const Vec3ia rhs(b); return a+rhs; } // scalar is first expanded to (b,b,b)
+  __forceinline Vec3ia operator +( const int     a, const Vec3ia& b ) { const Vec3ia lhs(a); return lhs+b; }
+
+  __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { const int x = a.x-b.x, y = a.y-b.y, z = a.z-b.z; return Vec3ia(x,y,z); } // component-wise difference
+  __forceinline Vec3ia operator -( const Vec3ia& a, const int     b ) { const Vec3ia rhs(b); return a-rhs; }
+  __forceinline Vec3ia operator -( const int     a, const Vec3ia& b ) { const Vec3ia lhs(a); return lhs-b; }
+
+  __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { const int x = a.x*b.x, y = a.y*b.y, z = a.z*b.z; return Vec3ia(x,y,z); } // component-wise product
+  __forceinline Vec3ia operator *( const Vec3ia& a, const int     b ) { const Vec3ia rhs(b); return a * rhs; }
+  __forceinline Vec3ia operator *( const int     a, const Vec3ia& b ) { const Vec3ia lhs(a); return lhs * b; }
+
+  __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { const int x = a.x&b.x, y = a.y&b.y, z = a.z&b.z; return Vec3ia(x,y,z); } // component-wise bitwise AND
+  __forceinline Vec3ia operator &( const Vec3ia& a, const int     b ) { const Vec3ia rhs(b); return a & rhs; } // scalar is first expanded to (b,b,b)
+  __forceinline Vec3ia operator &( const int     a, const Vec3ia& b ) { const Vec3ia lhs(a); return lhs & b; }
+
+  __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { const int x = a.x|b.x, y = a.y|b.y, z = a.z|b.z; return Vec3ia(x,y,z); } // component-wise bitwise OR
+  __forceinline Vec3ia operator |( const Vec3ia& a, const int     b ) { const Vec3ia rhs(b); return a | rhs; }
+  __forceinline Vec3ia operator |( const int     a, const Vec3ia& b ) { const Vec3ia lhs(a); return lhs | b; }
+
+  __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { const int x = a.x^b.x, y = a.y^b.y, z = a.z^b.z; return Vec3ia(x,y,z); } // component-wise bitwise XOR
+  __forceinline Vec3ia operator ^( const Vec3ia& a, const int     b ) { const Vec3ia rhs(b); return a ^ rhs; }
+  __forceinline Vec3ia operator ^( const int     a, const Vec3ia& b ) { const Vec3ia lhs(a); return lhs ^ b; }
+
+  __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { const int x = a.x<<n, y = a.y<<n, z = a.z<<n; return Vec3ia(x,y,z); } // each component shifted left by n
+  __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { const int x = a.x>>n, y = a.y>>n, z = a.z>>n; return Vec3ia(x,y,z); } // each component shifted right by n as a signed int
+
+  __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { const int x = a.x<<b, y = a.y<<b, z = a.z<<b; return Vec3ia(x,y,z); } // same expression as operator <<
+  __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { const int x = a.x>>b, y = a.y>>b, z = a.z>>b; return Vec3ia(x,y,z); } // same expression as operator >>
+  __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { const unsigned x = unsigned(a.x)>>b, y = unsigned(a.y)>>b, z = unsigned(a.z)>>b; return Vec3ia(x,y,z); } // shift done on the unsigned value so zeros are shifted in
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { a = a + b; return a; } // every compound operator stores the binary result in a and returns a
+  __forceinline Vec3ia& operator +=( Vec3ia& a, const int&   b ) { a = a + b; return a; }
+  
+  __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { a = a - b; return a; }
+  __forceinline Vec3ia& operator -=( Vec3ia& a, const int&   b ) { a = a - b; return a; }
+  
+  __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { a = a * b; return a; }
+  __forceinline Vec3ia& operator *=( Vec3ia& a, const int&    b ) { a = a * b; return a; }
+  
+  __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { a = a & b; return a; }
+  __forceinline Vec3ia& operator &=( Vec3ia& a, const int&    b ) { a = a & b; return a; }
+  
+  __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { a = a | b; return a; }
+  __forceinline Vec3ia& operator |=( Vec3ia& a, const int&    b ) { a = a | b; return a; }
+  
+  __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { a = a << b; return a; }
+  __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { a = a >> b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int reduce_add(const Vec3ia& v) { const int xy = v.x+v.y; return xy+v.z; } // sum of x, y and z
+  __forceinline int reduce_mul(const Vec3ia& v) { const int xy = v.x*v.y; return xy*v.z; } // product of x, y and z
+  __forceinline int reduce_min(const Vec3ia& v) { const int xy = sycl::min(v.x,v.y); return sycl::min(xy,v.z); } // smallest of x, y and z
+  __forceinline int reduce_max(const Vec3ia& v) { const int xy = sycl::max(v.x,v.y); return sycl::max(xy,v.z); } // largest of x, y and z
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return a.x == b.x & a.y == b.y & a.z == b.z; } // true only when x, y and z all match; the three bool results are combined with non-short-circuit &
+  __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return a.x != b.x | a.y != b.y | a.z != b.z; } // true when ANY component differs, i.e. the exact negation of ==; the inequalities must be OR-ed (AND-ing them is true only if every component differs)
+
+/*
+  __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) {
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    if (a.z != b.z) return a.z < b.z;
+    return false;
+  }
+*/
+  __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return Vec3ba(a.x == b.x, a.y == b.y, a.z == b.z); } // per-component a == b
+  __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return Vec3ba(a.x <  b.x, a.y <  b.y, a.z <  b.z); } // per-component signed a < b
+  __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return Vec3ba(a.x >  b.x, a.y >  b.y, a.z >  b.z); } // per-component signed a > b
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
+    /* each component comes from t where the matching mask component is set, otherwise from f */
+    return Vec3ia(m.x ? t.x : f.x,
+                  m.y ? t.y : f.y,
+                  m.z ? t.z : f.z);
+  }
+  
+  __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { const int x = sycl::min(a.x,b.x), y = sycl::min(a.y,b.y), z = sycl::min(a.z,b.z); return Vec3ia(x,y,z); } // component-wise minimum
+  __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { const int x = sycl::max(a.x,b.x), y = sycl::max(a.y,b.y), z = sycl::max(a.z,b.z); return Vec3ia(x,y,z); } // component-wise maximum
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  inline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) { // NOTE(review): a is never read; nothing is written to the stream
+    return cout; // presumably a deliberate no-op in this SYCL header - confirm, since the non-SYCL Vec3ia operator<< prints "(x, y, z)"
+  }
+}
--- a/thirdparty/embree/common/math/vec4.h
+++ b/thirdparty/embree/common/math/vec4.h
@@ -0,0 +1,266 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "emath.h"
+#include "vec3.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Generic 4D vector Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Vec4
+  {
+    enum { N = 4 };
+
+    /* named components; components[] (when compiled in) overlays the same storage */
+    union {
+      struct { T x, y, z, w; };
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+      T components[N];
+#endif
+    };
+
+    using Scalar = T;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! performs no explicit initialization of the components */
+    __forceinline Vec4( ) {}
+
+    /*! broadcasts a into all four components */
+    __forceinline explicit Vec4( const T& a )
+      : x(a), y(a), z(a), w(a) {}
+
+    /*! component-wise construction */
+    __forceinline Vec4( const T& x, const T& y, const T& z, const T& w )
+      : x(x), y(y), z(z), w(w) {}
+
+    /*! x,y,z from a 3D vector, w supplied separately */
+    __forceinline Vec4( const Vec3<T>& xyz, const T& w )
+      : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {}
+
+    /*! copy construction, component by component */
+    __forceinline Vec4( const Vec4& other ) {
+      x = other.x;
+      y = other.y;
+      z = other.z;
+      w = other.w;
+    }
+
+    /*! construction from a Vec3fx; only declared here, no body in the class */
+    __forceinline Vec4( const Vec3fx& other );
+
+    /*! conversion from a Vec4 of another element type; each component is cast to T */
+    template<typename T1> __forceinline Vec4( const Vec4<T1>& a )
+      : x(T(a.x)), y(T(a.y)), z(T(a.z)), w(T(a.w)) {}
+
+    /*! assignment from a Vec4 of another element type */
+    template<typename T1> __forceinline Vec4& operator =(const Vec4<T1>& other) {
+      x = other.x;
+      y = other.y;
+      z = other.z;
+      w = other.w;
+      return *this;
+    }
+
+    /*! copy assignment, component by component */
+    __forceinline Vec4& operator =(const Vec4& other) {
+      x = other.x;
+      y = other.y;
+      z = other.z;
+      w = other.w;
+      return *this;
+    }
+
+    /*! implicit conversion to a 3D vector; w is dropped */
+    __forceinline operator Vec3<T> () const {
+      return Vec3<T>(x,y,z);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec4( ZeroTy   )
+      : x(zero), y(zero), z(zero), w(zero) {}
+    __forceinline Vec4( OneTy    )
+      : x(one),  y(one),  z(one),  w(one) {}
+    __forceinline Vec4( PosInfTy )
+      : x(pos_inf), y(pos_inf), z(pos_inf), w(pos_inf) {}
+    __forceinline Vec4( NegInfTy )
+      : x(neg_inf), y(neg_inf), z(neg_inf), w(neg_inf) {}
+
+    /* indexed component access; the index is only checked by assert */
+#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler
+    __forceinline const T& operator [](const size_t axis) const {
+      assert(axis < 4);
+      return (&x)[axis];
+    }
+    __forceinline T& operator [](const size_t axis) {
+      assert(axis < 4);
+      return (&x)[axis];
+    }
+#else
+    __forceinline const T& operator [](const size_t axis ) const {
+      assert(axis < 4);
+      return components[axis];
+    }
+    __forceinline T& operator [](const size_t axis) {
+      assert(axis < 4);
+      return components[axis];
+    }
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Swizzles
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! the first three components as a 3D vector */
+    __forceinline Vec3<T> xyz() const {
+      return Vec3<T>(x, y, z);
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* sign operators, applied per component */
+  template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a ) {
+    return Vec4<T>(+a.x, +a.y, +a.z, +a.w);
+  }
+  template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a ) {
+    return Vec4<T>(-a.x, -a.y, -a.z, -a.w);
+  }
+
+  /* per-component forwarding to the scalar function of the same name */
+  template<typename T> __forceinline Vec4<T> abs( const Vec4<T>& a ) {
+    return Vec4<T>(abs(a.x), abs(a.y), abs(a.z), abs(a.w));
+  }
+  template<typename T> __forceinline Vec4<T> rcp( const Vec4<T>& a ) {
+    return Vec4<T>(rcp(a.x), rcp(a.y), rcp(a.z), rcp(a.w));
+  }
+  template<typename T> __forceinline Vec4<T> rsqrt( const Vec4<T>& a ) {
+    return Vec4<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z), rsqrt(a.w));
+  }
+  template<typename T> __forceinline Vec4<T> sqrt( const Vec4<T>& a ) {
+    return Vec4<T>(sqrt(a.x), sqrt(a.y), sqrt(a.z), sqrt(a.w));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* vector (op) vector, per component */
+  template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a, const Vec4<T>& b ) {
+    return Vec4<T>(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+  }
+  template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a, const Vec4<T>& b ) {
+    return Vec4<T>(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+  }
+  template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const Vec4<T>& b ) {
+    return Vec4<T>(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+  }
+
+  /* scaling by a scalar on either side */
+  template<typename T> __forceinline Vec4<T> operator *( const T& a, const Vec4<T>& b ) {
+    return Vec4<T>(a * b.x, a * b.y, a * b.z, a * b.w);
+  }
+  template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const T& b ) {
+    return Vec4<T>(a.x * b, a.y * b, a.z * b, a.w * b);
+  }
+
+  /* division: vector/vector, vector/scalar and scalar/vector */
+  template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const Vec4<T>& b ) {
+    return Vec4<T>(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+  }
+  template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const T& b ) {
+    return Vec4<T>(a.x / b, a.y / b, a.z / b, a.w / b);
+  }
+  template<typename T> __forceinline Vec4<T> operator /( const T& a, const Vec4<T>& b ) {
+    return Vec4<T>(a / b.x, a / b.y, a / b.z, a / b.w);
+  }
+
+  /*! Component-wise minimum of two vectors. */
+  template<typename T> __forceinline Vec4<T> min(const Vec4<T>& a, const Vec4<T>& b) {
+    return Vec4<T>(min(a.x, b.x),
+                   min(a.y, b.y),
+                   min(a.z, b.z),
+                   min(a.w, b.w));
+  }
+
+  /*! Component-wise maximum of two vectors. */
+  template<typename T> __forceinline Vec4<T> max(const Vec4<T>& a, const Vec4<T>& b) {
+    return Vec4<T>(max(a.x, b.x),
+                   max(a.y, b.y),
+                   max(a.z, b.z),
+                   max(a.w, b.w));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* vector forms: each component forwards to the scalar function of the same name */
+  template<typename T> __forceinline Vec4<T> madd( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) {
+    return Vec4<T>(madd(a.x,b.x,c.x),
+                   madd(a.y,b.y,c.y),
+                   madd(a.z,b.z,c.z),
+                   madd(a.w,b.w,c.w));
+  }
+  template<typename T> __forceinline Vec4<T> msub( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) {
+    return Vec4<T>(msub(a.x,b.x,c.x),
+                   msub(a.y,b.y,c.y),
+                   msub(a.z,b.z,c.z),
+                   msub(a.w,b.w,c.w));
+  }
+  template<typename T> __forceinline Vec4<T> nmadd( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) {
+    return Vec4<T>(nmadd(a.x,b.x,c.x),
+                   nmadd(a.y,b.y,c.y),
+                   nmadd(a.z,b.z,c.z),
+                   nmadd(a.w,b.w,c.w));
+  }
+  template<typename T> __forceinline Vec4<T> nmsub( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) {
+    return Vec4<T>(nmsub(a.x,b.x,c.x),
+                   nmsub(a.y,b.y,c.y),
+                   nmsub(a.z,b.z,c.z),
+                   nmsub(a.w,b.w,c.w));
+  }
+
+  /* forms with a scalar first operand: the same a is used for every component */
+  template<typename T> __forceinline Vec4<T> madd( const T& a, const Vec4<T>& b, const Vec4<T>& c) {
+    return Vec4<T>(madd(a,b.x,c.x),
+                   madd(a,b.y,c.y),
+                   madd(a,b.z,c.z),
+                   madd(a,b.w,c.w));
+  }
+  template<typename T> __forceinline Vec4<T> msub( const T& a, const Vec4<T>& b, const Vec4<T>& c) {
+    return Vec4<T>(msub(a,b.x,c.x),
+                   msub(a,b.y,c.y),
+                   msub(a,b.z,c.z),
+                   msub(a,b.w,c.w));
+  }
+  template<typename T> __forceinline Vec4<T> nmadd( const T& a, const Vec4<T>& b, const Vec4<T>& c) {
+    return Vec4<T>(nmadd(a,b.x,c.x),
+                   nmadd(a,b.y,c.y),
+                   nmadd(a,b.z,c.z),
+                   nmadd(a,b.w,c.w));
+  }
+  template<typename T> __forceinline Vec4<T> nmsub( const T& a, const Vec4<T>& b, const Vec4<T>& c) {
+    return Vec4<T>(nmsub(a,b.x,c.x),
+                   nmsub(a,b.y,c.y),
+                   nmsub(a,b.z,c.z),
+                   nmsub(a,b.w,c.w));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* compound assignment with a vector right-hand side */
+  template<typename T> __forceinline Vec4<T>& operator +=( Vec4<T>& a, const Vec4<T>& b ) {
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+    return a;
+  }
+  template<typename T> __forceinline Vec4<T>& operator -=( Vec4<T>& a, const Vec4<T>& b ) {
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    a.w -= b.w;
+    return a;
+  }
+
+  /* compound assignment with a scalar right-hand side */
+  template<typename T> __forceinline Vec4<T>& operator *=( Vec4<T>& a, const T& b ) {
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+    return a;
+  }
+  template<typename T> __forceinline Vec4<T>& operator /=( Vec4<T>& a, const T& b ) {
+    a.x /= b;
+    a.y /= b;
+    a.z /= b;
+    a.w /= b;
+    return a;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Sum of the four components. */
+  template<typename T> __forceinline T reduce_add( const Vec4<T>& a ) {
+    return a.x + a.y + a.z + a.w;
+  }
+
+  /*! Product of the four components. */
+  template<typename T> __forceinline T reduce_mul( const Vec4<T>& a ) {
+    return a.x * a.y * a.z * a.w;
+  }
+
+  /*! Smallest component, via the four-argument min. */
+  template<typename T> __forceinline T reduce_min( const Vec4<T>& a ) {
+    return min(a.x, a.y, a.z, a.w);
+  }
+
+  /*! Largest component, via the four-argument max. */
+  template<typename T> __forceinline T reduce_max( const Vec4<T>& a ) {
+    return max(a.x, a.y, a.z, a.w);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Equality: every component must compare equal (short-circuits on the first mismatch). */
+  template<typename T> __forceinline bool operator ==( const Vec4<T>& a, const Vec4<T>& b ) {
+    return a.x == b.x
+        && a.y == b.y
+        && a.z == b.z
+        && a.w == b.w;
+  }
+
+  /*! Inequality: any differing component suffices (short-circuits on the first difference). */
+  template<typename T> __forceinline bool operator !=( const Vec4<T>& a, const Vec4<T>& b ) {
+    return a.x != b.x
+        || a.y != b.y
+        || a.z != b.z
+        || a.w != b.w;
+  }
+  /*! Lexicographic ordering over x, y, z, w: the first component pair that
+   *  compares unequal decides; vectors with no unequal pair are not "less". */
+  template<typename T> __forceinline bool operator < ( const Vec4<T>& a, const Vec4<T>& b ) {
+    for (size_t axis = 0; axis < 4; axis++) {
+      if (a[axis] != b[axis])
+        return a[axis] < b[axis];
+    }
+    return false;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Shift Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Applies the scalar shift_right_1 to each of the four components. */
+  template<typename T> __forceinline Vec4<T> shift_right_1( const Vec4<T>& a ) {
+    return Vec4<T>(shift_right_1(a.x),
+                   shift_right_1(a.y),
+                   shift_right_1(a.z),
+                   shift_right_1(a.w));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! 4D dot product, accumulated from w towards x through nested madd calls. */
+  template<typename T> __forceinline T dot( const Vec4<T>& a, const Vec4<T>& b ) {
+    return madd(a.x, b.x,
+           madd(a.y, b.y,
+           madd(a.z, b.z, a.w*b.w)));
+  }
+
+  /*! Euclidean length: sqrt of the vector dotted with itself. */
+  template<typename T> __forceinline T length( const Vec4<T>& a ) {
+    return sqrt(dot(a,a));
+  }
+
+  /*! Scales a by rsqrt of its squared length. */
+  template<typename T> __forceinline Vec4<T> normalize( const Vec4<T>& a ) {
+    return a*rsqrt(dot(a,a));
+  }
+
+  /*! Length of the difference a-b. */
+  template<typename T> __forceinline T distance( const Vec4<T>& a, const Vec4<T>& b ) {
+    return length(a-b);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! One bool chooses between t and f for all four components. */
+  template<typename T> __forceinline Vec4<T> select ( bool s, const Vec4<T>& t, const Vec4<T>& f ) {
+    return Vec4<T>(select(s,t.x,f.x),
+                   select(s,t.y,f.y),
+                   select(s,t.z,f.z),
+                   select(s,t.w,f.w));
+  }
+
+  /*! Each component has its own bool selector. */
+  template<typename T> __forceinline Vec4<T> select ( const Vec4<bool>& s, const Vec4<T>& t, const Vec4<T>& f ) {
+    return Vec4<T>(select(s.x,t.x,f.x),
+                   select(s.y,t.y,f.y),
+                   select(s.z,t.z,f.z),
+                   select(s.w,t.w,f.w));
+  }
+
+  /*! The same T::Bool mask is handed to the select of every component. */
+  template<typename T> __forceinline Vec4<T> select ( const typename T::Bool& s, const Vec4<T>& t, const Vec4<T>& f ) {
+    return Vec4<T>(select(s,t.x,f.x),
+                   select(s,t.y,f.y),
+                   select(s,t.z,f.z),
+                   select(s,t.w,f.w));
+  }
+
+  /*! Linear interpolation, evaluated as (1-t)*v0 + t*v1 through madd. */
+  template<typename T>
+    __forceinline Vec4<T> lerp(const Vec4<T>& v0, const Vec4<T>& v1, const T& t) {
+    return madd(Vec4<T>(T(1.0f)-t), // weight of v0, broadcast to a vector
+                v0,
+                t*v1);              // v1 already scaled by t
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Writes the vector as "(x, y, z, w)" and returns the result of the chained insertions. */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec4<T>& a) {
+    return cout << "("
+                << a.x << ", "
+                << a.y << ", "
+                << a.z << ", "
+                << a.w << ")";
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Default template instantiations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  using Vec4b  = Vec4<bool>;          // bool components
+  using Vec4uc = Vec4<unsigned char>; // unsigned char components
+  using Vec4i  = Vec4<int>;           // int components
+  using Vec4f  = Vec4<float>;         // float components
+}
+
+#include "vec3ba.h"
+#include "vec3ia.h"
+#include "vec3fa.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / MIC specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+#include "../simd/sse.h"
+#endif
+
+#if defined __AVX__
+#include "../simd/avx.h"
+#endif
+
+#if defined __AVX512F__
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  /*! Vec4<float> from a Vec3fx: all four fields of a are copied, w included. */
+  template<> __forceinline Vec4<float>::Vec4( const Vec3fx& a ) {
+    x = a.x;
+    y = a.y;
+    z = a.z;
+    w = a.w;
+  }
+
+#if !defined(__SYCL_DEVICE_ONLY__) // specializations used when __SYCL_DEVICE_ONLY__ is not defined
+
+#if defined(__AVX__) // Vec4<vfloat4> with AVX: plain per-component assignment from a's fields
+  template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
+    x = a.x; y = a.y; z = a.z; w = a.w;
+  }
+#elif defined(__SSE__) || defined(__ARM_NEON) // Vec4<vfloat4> on SSE/NEON without AVX
+  template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
+    const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); // component k is built with shuffle<k,k,k,k> on a's register; presumably a broadcast of lane k - confirm against shuffle()
+  }
+#endif
+
+#if defined(__AVX__) // Vec4<vfloat8>
+  template<> __forceinline Vec4<vfloat8>::Vec4( const Vec3fx& a ) {
+    x = a.x; y = a.y; z = a.z; w = a.w;
+  }
+#endif
+
+#if defined(__AVX512F__) // Vec4<vfloat16>; the only variant here that uses a member-initializer list
+  template<> __forceinline Vec4<vfloat16>::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {}
+#endif
+
+#else // __SYCL_DEVICE_ONLY__ is defined: every variant below uses plain per-component assignment
+
+#if defined(__SSE__) // Vec4<vfloat4>
+  template<> __forceinline Vec4<vfloat4>::Vec4(const Vec3fx& a) {
+    x = a.x; y = a.y; z = a.z; w = a.w;
+  }
+#endif
+#if defined(__AVX__) // Vec4<vfloat8>
+  template<> __forceinline Vec4<vfloat8>::Vec4(const Vec3fx& a) {
+    x = a.x; y = a.y; z = a.z; w = a.w;
+  }
+#endif
+#if defined(__AVX512F__) // Vec4<vfloat16>
+  template<> __forceinline Vec4<vfloat16>::Vec4(const Vec3fx& a) {
+    x = a.x; y = a.y; z = a.z; w = a.w;
+  }
+#endif
+  
+#endif // !defined(__SYCL_DEVICE_ONLY__)
+}
+
--- a/thirdparty/embree/common/simd/arm/avx2neon.h
+++ b/thirdparty/embree/common/simd/arm/avx2neon.h
--- a/thirdparty/embree/common/simd/arm/emulation.h
+++ b/thirdparty/embree/common/simd/arm/emulation.h
@@ -0,0 +1,87 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+/* Make precision match SSE, at the cost of some performance */
+#if !defined(__aarch64__)
+#  define SSE2NEON_PRECISE_DIV 1
+#  define SSE2NEON_PRECISE_SQRT 1
+#endif
+
+#include "sse2neon.h"
+
+/* Per-lane absolute value, implemented with vabsq_f32. */
+__forceinline __m128 _mm_abs_ps(__m128 a)
+{
+    return vabsq_f32(a);
+}
+
+/* Intel FMA names expressed with vfmaq_f32 (c + a*b) and vfmsq_f32 (c - a*b). */
+
+/* a*b + c */
+__forceinline __m128 _mm_fmadd_ps (__m128 a, __m128 b, __m128 c)
+{
+    return vfmaq_f32(c, a, b);
+}
+
+/* -(a*b) + c */
+__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c)
+{
+    return vfmsq_f32(c, a, b);
+}
+
+/* -(a*b) - c, obtained by negating c + a*b */
+__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c)
+{
+    const __m128 sum = vfmaq_f32(c, a, b);
+    return vnegq_f32(sum);
+}
+
+/* a*b - c, obtained by negating c - a*b */
+__forceinline __m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c)
+{
+    const __m128 diff = vfmsq_f32(c, a, b);
+    return vnegq_f32(diff);
+}
+
+/* Reads one float from mem_addr and replicates it into every lane. */
+__forceinline __m128 _mm_broadcast_ss (float const * mem_addr)
+{
+    const float scalar = *mem_addr;
+    return vdupq_n_f32(scalar);
+}
+
+// AVX2 emulation leverages Intel FMA defs above.  Include after them.
+#include "avx2neon.h"
+
+/* Dummy defines for floating point control */
+#define _MM_MASK_MASK 0x1f80
+#define _MM_MASK_DIV_ZERO 0x200
+// #define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_MASK_DENORM 0x100
+#define _MM_SET_EXCEPTION_MASK(x)
+// #define _MM_SET_FLUSH_ZERO_MODE(x)
+
+/*
+__forceinline int _mm_getcsr()
+{
+  return 0;
+}
+
+__forceinline void _mm_mfence()
+{
+  __sync_synchronize();
+}
+*/
+
+/* Loads unsigned bytes from ptr and zero-extends the first four into 32-bit lanes. */
+__forceinline __m128i _mm_load4epu8_epi32(__m128i *ptr)
+{
+    const uint8x8_t  bytes  = vld1_u8((uint8_t*)ptr);           // 8-byte load
+    const uint16x8_t halves = vmovl_u8(bytes);                  // u8  -> u16
+    const uint32x4_t words  = vmovl_u16(vget_low_u16(halves));  // low four lanes, u16 -> u32
+    return vreinterpretq_s32_u32(words);
+}
+
+/* Loads unsigned 16-bit values from ptr and zero-extends the first four into 32-bit lanes. */
+__forceinline __m128i _mm_load4epu16_epi32(__m128i *ptr)
+{
+    const uint16x8_t halves = vld1q_u16((uint16_t*)ptr);        // eight u16 values
+    const uint32x4_t words  = vmovl_u16(vget_low_u16(halves));  // low four lanes, u16 -> u32
+    return vreinterpretq_s32_u32(words);
+}
+
+/* Loads signed bytes from ptr, sign-extends the first four to 32 bits and
+ * converts them to float; the float bits are returned in an integer register type. */
+__forceinline __m128i _mm_load4epi8_f32(__m128i *ptr)
+{
+    const int8x8_t    bytes  = vld1_s8((int8_t*)ptr);            // 8-byte load
+    const int16x8_t   halves = vmovl_s8(bytes);                  // s8  -> s16
+    const int32x4_t   words  = vmovl_s16(vget_low_s16(halves));  // low four lanes, s16 -> s32
+    const float32x4_t floats = vcvtq_f32_s32(words);             // s32 -> f32
+    return vreinterpretq_s32_f32(floats);
+}
+
+/* Loads unsigned bytes from ptr, zero-extends the first four to 32 bits and
+ * converts them to float; the float bits are returned in an integer register
+ * type, mirroring the signed _mm_load4epi8_f32.
+ * The u32 -> f32 conversion is what separates this from _mm_load4epu8_epi32;
+ * without it the "_f32" result would hold raw integers whose bits get
+ * reinterpreted as (denormal) floats. */
+__forceinline __m128i _mm_load4epu8_f32(__m128i *ptr)
+{
+    uint8x8_t   t0 = vld1_u8((uint8_t*)ptr);
+    uint16x8_t  t1 = vmovl_u8(t0);
+    uint32x4_t  t2 = vmovl_u16(vget_low_u16(t1));
+    float32x4_t t3 = vcvtq_f32_u32(t2);
+    return vreinterpretq_s32_f32(t3);
+}
+
+/* Loads signed 16-bit values from ptr, sign-extends the first four to 32 bits
+ * and converts them to float; the float bits are returned in an integer register type. */
+__forceinline __m128i _mm_load4epi16_f32(__m128i *ptr)
+{
+    const int16x8_t   halves = vld1q_s16((int16_t*)ptr);         // eight s16 values
+    const int32x4_t   words  = vmovl_s16(vget_low_s16(halves));  // low four lanes, s16 -> s32
+    const float32x4_t floats = vcvtq_f32_s32(words);             // s32 -> f32
+    return vreinterpretq_s32_f32(floats);
+}
--- a/thirdparty/embree/common/simd/arm/sse2neon.h
+++ b/thirdparty/embree/common/simd/arm/sse2neon.h
--- a/thirdparty/embree/common/simd/avx.h
+++ b/thirdparty/embree/common/simd/avx.h
@@ -0,0 +1,33 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "sse.h"
+
+#if defined(__AVX512VL__)
+#include "vboolf8_avx512.h"
+#include "vboold4_avx512.h"
+#else
+#include "vboolf8_avx.h"
+#include "vboold4_avx.h"
+#endif
+
+#if defined(__AVX2__)
+#include "vint8_avx2.h"
+#include "vuint8_avx2.h"
+#if defined(__X86_64__)
+#include "vllong4_avx2.h"
+#endif
+#else
+#include "vint8_avx.h"
+#include "vuint8_avx.h"
+#endif
+#include "vfloat8_avx.h"
+#if defined(__X86_64__)
+#include "vdouble4_avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "avx512.h"
+#endif
--- a/thirdparty/embree/common/simd/avx512.h
+++ b/thirdparty/embree/common/simd/avx512.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "../math/constants.h"
+#include "../sys/alloc.h"
+#include "varying.h"
+
+#include "vboolf16_avx512.h"
+#include "vint16_avx512.h"
+#include "vuint16_avx512.h"
+#include "vfloat16_avx512.h"
+
+#include "vboold8_avx512.h"
+#include "vllong8_avx512.h"
+#include "vdouble8_avx512.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Prefetching
+  ////////////////////////////////////////////////////////////////////////////////
+
+#define PFHINT_L1   0
+#define PFHINT_L2   1
+#define PFHINT_NT   2
+
+  /*! Issues a prefetch for address m. The template argument picks the hint:
+   *  PFHINT_L1 -> _MM_HINT_T0, PFHINT_L2 -> _MM_HINT_T1, PFHINT_NT -> _MM_HINT_NTA.
+   *  Any other mode value does nothing. */
+  template<const unsigned int mode>
+    __forceinline void prefetch(const void * __restrict__ const m)
+  {
+    switch (mode) {
+    case PFHINT_L1: _mm_prefetch((const char*)m,_MM_HINT_T0);  break;
+    case PFHINT_L2: _mm_prefetch((const char*)m,_MM_HINT_T1);  break;
+    case PFHINT_NT: _mm_prefetch((const char*)m,_MM_HINT_NTA); break;
+    default: break;
+    }
+  }
+}
--- a/thirdparty/embree/common/simd/simd.h
+++ b/thirdparty/embree/common/simd/simd.h
@@ -0,0 +1,110 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/emath.h"
+
+/* include SSE wrapper classes */
+#if defined(__SSE__) || defined(__ARM_NEON)
+#  include "sse.h"
+#endif
+
+/* include AVX wrapper classes */
+#if defined(__AVX__)
+#  include "avx.h"
+#endif
+
+/* include AVX512 wrapper classes */
+#if defined (__AVX512F__)
+#  include "avx512.h"
+#endif
+
+namespace embree
+{
+  /*! Per-lane range test: a lane passes when its value lies within
+   *  [-max, +max] of float, both bounds included. */
+  template <int N>
+  __forceinline vbool<N> isfinite(const vfloat<N>& v)
+  {
+    const vfloat<N> lowest (-std::numeric_limits<float>::max());
+    const vfloat<N> highest( std::numeric_limits<float>::max());
+    return (v >= lowest) & (v <= highest);
+  }
+  
+  /* foreach unique */
+  /*! Calls closure(mask, value) once per distinct value found in the active
+   *  lanes of vi; mask marks the still-pending lanes that hold that value. */
+  template<typename vbool, typename vint, typename Closure>
+  __forceinline void foreach_unique(const vbool& valid0, const vint& vi, const Closure& closure)
+  {
+    for (vbool pending = valid0; any(pending); )
+    {
+      /* take the value held by the first pending lane */
+      const int lane  = int(bsf(movemask(pending)));
+      const int value = vi[lane];
+
+      /* retire every pending lane with that value before invoking the closure */
+      const vbool matching = pending & (value == vi);
+      pending = andn(pending, matching);
+      closure(matching, value);
+    }
+  }
+
+  /* returns the next unique value i in vi and the corresponding valid_i mask */
+  /*! Single step of the unique-value iteration: picks the value of the first
+   *  active lane, stores the mask of active lanes holding it in valid_i,
+   *  clears those lanes from valid and returns the value.
+   *  valid must have at least one active lane (checked by assert). */
+  template<typename vbool, typename vint>
+  __forceinline int next_unique(vbool& valid, const vint& vi, /*out*/ vbool& valid_i)
+  {
+    assert(any(valid));
+    const int lane  = int(bsf(movemask(valid)));
+    const int value = vi[lane];
+    valid_i = valid & (value == vi);
+    valid   = andn(valid, valid_i);
+    return value;
+  }
+
+  /* foreach unique index */
+  /*! Like foreach_unique, but the closure also receives the index of the lane
+   *  the value was taken from: closure(mask, value, lane). */
+  template<typename vbool, typename vint, typename Closure>
+  __forceinline void foreach_unique_index(const vbool& valid0, const vint& vi, const Closure& closure)
+  {
+    for (vbool pending = valid0; any(pending); )
+    {
+      /* take the value held by the first pending lane */
+      const int lane  = int(bsf(movemask(pending)));
+      const int value = vi[lane];
+
+      /* retire every pending lane with that value before invoking the closure */
+      const vbool matching = pending & (value == vi);
+      pending = andn(pending, matching);
+      closure(matching, value, lane);
+    }
+  }
+
+  /* returns the index of the next unique value i in vi and the corresponding valid_i mask */
+  /*! Single step of the unique-value iteration that reports a lane index:
+   *  takes the value of the first active lane, stores the mask of active lanes
+   *  holding it in valid_i, clears those lanes from valid and returns the index
+   *  of that first lane. valid must have at least one active lane (checked by assert). */
+  template<typename vbool, typename vint>
+  __forceinline int next_unique_index(vbool& valid, const vint& vi, /*out*/ vbool& valid_i)
+  {
+    assert(any(valid));
+    const int lane  = int(bsf(movemask(valid)));
+    const int value = vi[lane];
+    valid_i = valid & (value == vi);
+    valid   = andn(valid, valid_i);
+    return lane;
+  }
+
+  template<typename Closure>
+  __forceinline void foreach2(int x0, int x1, int y0, int y1, const Closure& closure) // calls closure(valid, xs, ys) on batches of (x,y) pairs from [x0,x1) x [y0,y1)
+  {
+    __aligned(64) int U[2*VSIZEX]; // staged x coordinates
+    __aligned(64) int V[2*VSIZEX]; // staged y coordinates
+    int index = 0; // number of pairs currently staged in U/V
+    for (int y=y0; y<y1; y++) {
+      const bool lasty = y+1>=y1; // true on the final row
+      const vintx vy = y;
+      for (int x=x0; x<x1; ) { //x+=VSIZEX) {
+        const bool lastx = x+VSIZEX >= x1; // true when this batch reaches the end of the row
+        vintx vx = x+vintx(step); // presumably x, x+1, x+2, ... per lane - confirm the meaning of `step`
+        vintx::storeu(&U[index], vx); // appended at the current fill position; buffers are sized 2*VSIZEX
+        vintx::storeu(&V[index], vy);
+        const int dx = min(x1-x,VSIZEX); // entries of this batch that lie inside the row
+        index += dx;
+        x += dx;
+        if (index >= VSIZEX || (lastx && lasty)) { // flush once at least VSIZEX pairs are staged, or at the very last batch
+          const vboolx valid = vintx(step) < vintx(index); // lane mask derived from the number of staged pairs
+          closure(valid, vintx::load(U), vintx::load(V)); // only the front of U/V is loaded and passed on
+          x-= max(0, index-VSIZEX); // rewind x by the pairs staged beyond VSIZEX; the reset below discards them, so they get staged again
+          index = 0;
+        }
+      }
+    }
+  }
+}
--- a/thirdparty/embree/common/simd/sse.cpp
+++ b/thirdparty/embree/common/simd/sse.cpp
@@ -0,0 +1,34 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sse.h"
+
+namespace embree 
+{
+  const __m128 mm_lookupmask_ps[16] = { // entry i: 32-bit lane k is all ones iff bit k of i is set (_mm_set_epi32 lists lanes 3,2,1,0)
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)), // i = 0
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)), // i = 1
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)), // i = 2
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)), // i = 3
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)), // i = 4
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)), // i = 5
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)), // i = 6
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)), // i = 7
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)), // i = 8
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)), // i = 9
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)), // i = 10
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)), // i = 11
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)), // i = 12
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)), // i = 13
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)), // i = 14
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1))  // i = 15
+  };
+
+  const __m128d mm_lookupmask_pd[4] = { // entry i: 64-bit lane k is all ones iff bit k of i is set (each lane is a pair of 32-bit values)
+    _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)), // i = 0: no lane
+    _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)), // i = 1: low lane
+    _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)), // i = 2: high lane
+    _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1))  // i = 3: both lanes
+  };
+
+}
--- a/thirdparty/embree/common/simd/sse.h
+++ b/thirdparty/embree/common/simd/sse.h
@@ -0,0 +1,35 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "../sys/alloc.h"
+#include "../math/constants.h"
+#include "varying.h"
+
+namespace embree 
+{
+  /*! Blend of f and t under mask. Uses _mm_blendv_ps where it is available
+   *  (aarch64 or SSE4.1); otherwise combines (mask & t) | (~mask & f) bitwise. */
+  __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) {
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    return _mm_blendv_ps(f,t,mask);
+#else
+    const __m128 from_t = _mm_and_ps(mask, t);
+    const __m128 from_f = _mm_andnot_ps(mask, f);
+    return _mm_or_ps(from_t, from_f);
+#endif
+  }
+
+  extern const __m128  mm_lookupmask_ps[16];
+  extern const __m128d mm_lookupmask_pd[4];
+}
+
+#if defined(__AVX512VL__)
+#include "vboolf4_avx512.h"
+#else
+#include "vboolf4_sse2.h"
+#endif
+#include "vint4_sse2.h"
+#include "vuint4_sse2.h"
+#include "vfloat4_sse2.h"
--- a/thirdparty/embree/common/simd/varying.h
+++ b/thirdparty/embree/common/simd/varying.h
@@ -0,0 +1,145 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+
+namespace embree
+{
+  /* Varying numeric types */
+  /* N float lanes, with an int view overlaying the same storage */
+  template<int N>
+  struct vfloat_impl
+  {
+    union {
+      float f[N];
+      int   i[N];
+    };
+
+    /* lane access; the index is only checked by assert */
+    __forceinline const float& operator [](size_t index) const {
+      assert(index < N);
+      return f[index];
+    }
+    __forceinline float& operator [](size_t index) {
+      assert(index < N);
+      return f[index];
+    }
+  };
+  
+  /* N double lanes, with a long long view overlaying the same storage */
+  template<int N>
+  struct vdouble_impl
+  {
+    union {
+      double    f[N];
+      long long i[N];
+    };
+
+    /* lane access; the index is only checked by assert */
+    __forceinline const double& operator [](size_t index) const {
+      assert(index < N);
+      return f[index];
+    }
+    __forceinline double& operator [](size_t index) {
+      assert(index < N);
+      return f[index];
+    }
+  };
+
+  /* N int lanes */
+  template<int N>
+  struct vint_impl
+  {
+    int i[N];
+
+    /* lane access; the index is only checked by assert */
+    __forceinline const int& operator [](size_t index) const {
+      assert(index < N);
+      return i[index];
+    }
+    __forceinline int& operator [](size_t index) {
+      assert(index < N);
+      return i[index];
+    }
+  };
+  
+  /* N unsigned int lanes */
+  template<int N>
+  struct vuint_impl
+  {
+    unsigned int i[N];
+
+    /* lane access; the index is only checked by assert */
+    __forceinline const unsigned int& operator [](size_t index) const {
+      assert(index < N);
+      return i[index];
+    }
+    __forceinline unsigned int& operator [](size_t index) {
+      assert(index < N);
+      return i[index];
+    }
+  };
+
+  /* N long long lanes */
+  template<int N>
+  struct vllong_impl
+  {
+    long long i[N];
+
+    /* lane access; the index is only checked by assert */
+    __forceinline const long long& operator [](size_t index) const {
+      assert(index < N);
+      return i[index];
+    }
+    __forceinline long long& operator [](size_t index) {
+      assert(index < N);
+      return i[index];
+    }
+  };
+
+  /* Varying bool types */
+  /* mask type paired with float/int lanes: one int per lane */
+  template<int N>
+  struct vboolf_impl
+  {
+    int i[N];
+  };
+
+  /* mask type paired with double/long long lanes: one long long per lane */
+  template<int N>
+  struct vboold_impl
+  {
+    long long i[N];
+  };
+ 
+  /* Varying size constants */
+  /* default size: 8 whenever AVX512VL (SKX) or AVX is enabled, 4 otherwise */
+#if defined(__AVX512VL__) || defined(__AVX__)
+  const int VSIZEX = 8;
+#else
+  const int VSIZEX = 4;
+#endif
+
+  /* large size: 16 with AVX512VL (SKX), 8 with AVX only, 4 otherwise */
+#if defined(__AVX512VL__)
+  const int VSIZEL = 16;
+#elif defined(__AVX__)
+  const int VSIZEL = 8;
+#else
+  const int VSIZEL = 4;
+#endif
+
+  /* Maps a width N to the generic *_impl<N> types of that width. */
+  template<int N>
+  struct vtypes {
+    /* masks; vbool is the float/int flavour */
+    using vbool   = vboolf_impl<N>;
+    using vboolf  = vboolf_impl<N>;
+    using vboold  = vboold_impl<N>;
+
+    /* integer lanes */
+    using vint    = vint_impl<N>;
+    using vuint   = vuint_impl<N>;
+    using vllong  = vllong_impl<N>;
+
+    /* floating-point lanes */
+    using vfloat  = vfloat_impl<N>;
+    using vdouble = vdouble_impl<N>;
+  };
+
+  /* Width 1 maps to the plain built-in scalar types. */
+  template<>
+  struct vtypes<1> {
+    /* masks */
+    using vbool   = bool;
+    using vboolf  = bool;
+    using vboold  = bool;
+
+    /* integers */
+    using vint    = int;
+    using vuint   = unsigned int;
+    using vllong  = long long;
+
+    /* floating point */
+    using vfloat  = float;
+    using vdouble = double;
+  };
+
+  /* Aliases to default types */
+  /* masks */
+  template<int N> using vbool   = typename vtypes<N>::vbool;
+  template<int N> using vboolf  = typename vtypes<N>::vboolf;
+  template<int N> using vboold  = typename vtypes<N>::vboold;
+
+  /* integers */
+  template<int N> using vint    = typename vtypes<N>::vint;
+  template<int N> using vuint   = typename vtypes<N>::vuint;
+  template<int N> using vllong  = typename vtypes<N>::vllong;
+
+  /* floating point; vreal names the same type as vfloat */
+  template<int N> using vreal   = typename vtypes<N>::vfloat;
+  template<int N> using vfloat  = typename vtypes<N>::vfloat;
+  template<int N> using vdouble = typename vtypes<N>::vdouble;
+
+  /* 4-wide shortcuts */
+  typedef vfloat<4>  vfloat4;
+  typedef vdouble<4> vdouble4;
+  typedef vreal<4>   vreal4;
+  typedef vint<4>    vint4;
+  typedef vuint<4>  vuint4;
+  typedef vllong<4>  vllong4;
+  typedef vbool<4>   vbool4;
+  typedef vboolf<4>  vboolf4;
+  typedef vboold<4>  vboold4;
+
+  /* 8-wide shortcuts */
+  typedef vfloat<8>  vfloat8;
+  typedef vdouble<8> vdouble8;
+  typedef vreal<8>   vreal8;
+  typedef vint<8>    vint8;
+  typedef vuint<8>    vuint8;
+  typedef vllong<8>  vllong8;
+  typedef vbool<8>   vbool8;
+  typedef vboolf<8>  vboolf8;
+  typedef vboold<8>  vboold8;
+
+  /* 16-wide shortcuts */
+  typedef vfloat<16>  vfloat16;
+  typedef vdouble<16> vdouble16;
+  typedef vreal<16>   vreal16;
+  typedef vint<16>    vint16;
+  typedef vuint<16>   vuint16;
+  typedef vllong<16>  vllong16;
+  typedef vbool<16>   vbool16;
+  typedef vboolf<16>  vboolf16;
+  typedef vboold<16>  vboold16;
+
+  /* Default shortcuts */
+  typedef vfloat<VSIZEX>  vfloatx;
+  typedef vdouble<VSIZEX> vdoublex;
+  typedef vreal<VSIZEX>   vrealx;
+  typedef vint<VSIZEX>    vintx;
+  typedef vuint<VSIZEX>   vuintx;
+  typedef vllong<VSIZEX>  vllongx;
+  typedef vbool<VSIZEX>   vboolx;
+  typedef vboolf<VSIZEX>  vboolfx;
+  typedef vboold<VSIZEX>  vbooldx;
+}
--- a/thirdparty/embree/common/simd/vboold4_avx.h
+++ b/thirdparty/embree/common/simd/vboold4_avx.h
@@ -0,0 +1,174 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide AVX bool type for 64bit data types; every lane is a 64-bit mask inside one 256-bit register */
+  template<>
+  struct vboold<4>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboold4 Bool;
+
+    enum  { size = 4 };       // number of SIMD elements
+    union {                   // data: three views of the same 256 bits
+      __m256d v;              // whole register
+      struct { __m128d vl,vh; }; // 128-bit halves; vl carries lanes 0..1, vh lanes 2..3
+      long long i[4];         // per-lane integer view
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold() {} // leaves the lanes uninitialized
+    __forceinline vboold(const vboold4& a) { v = a.v; }
+    __forceinline vboold4& operator =(const vboold4& a) { v = a.v; return *this; }
+
+    __forceinline vboold(__m256d a) : v(a) {}
+    __forceinline vboold(__m256i a) : v(_mm256_castsi256_pd(a)) {} // bit-preserving reinterpretation
+
+    __forceinline operator const __m256() const { return _mm256_castpd_ps(v); }     // bit-preserving reinterpretation
+    __forceinline operator const __m256i() const { return _mm256_castpd_si256(v); } // bit-preserving reinterpretation
+    __forceinline operator const __m256d() const { return v; }
+
+    __forceinline vboold(int a) // builds the mask from a bit pattern: bit k of a selects lane k
+    {
+      assert(a >= 0 && a <= 255);
+#if defined (__AVX2__)
+      const __m256i mask = _mm256_set_epi64x(0x8, 0x4, 0x2, 0x1); // lane k holds the single bit 1 << k
+      const __m256i b = _mm256_set1_epi64x(a);
+      const __m256i c = _mm256_and_si256(b,mask);
+      v = _mm256_castsi256_pd(_mm256_cmpeq_epi64(c,mask)); // all ones in lane k when bit k of a is set
+#else
+      vl = mm_lookupmask_pd[a & 0x3];
+      vh = mm_lookupmask_pd[(a >> 2) & 0x3]; // keep the index in the same 2-bit range as vl for a > 15; the AVX2 path likewise ignores bits >= 4
+#endif
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {}
+#if !defined(__aarch64__)
+    __forceinline vboold(TrueTy)  : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {} // 0 == 0 holds in every lane
+#else
+    __forceinline vboold(TrueTy)  : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {} // 0 == 0 holds in every lane
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool       operator [](size_t index) const { assert(index < 4); return (_mm256_movemask_pd(v) >> index) & 1; } // sign bit of the lane
+    __forceinline long long& operator [](size_t index)       { assert(index < 4); return i[index]; }                             // raw 64-bit lane, writable
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a, vboold4(embree::True)); } // lane-wise NOT: XOR with the all-true mask
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a, b); } // lane-wise AND
+  __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a, b); } // lane-wise OR
+  __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } // lane-wise XOR
+
+  __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b, a); } // a & ~b: the intrinsic negates its first operand
+
+  __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { a = a & b; return a; } // in-place AND
+  __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { a = a | b; return a; } // in-place OR
+  __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { a = a ^ b; return a; } // in-place XOR
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } // set where the lanes differ
+  __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(_mm256_xor_pd(a,b),vboold4(embree::True)); } // XOR, then inverted: set where the lanes agree
+
+  __forceinline vboold4 select(const vboold4& mask, const vboold4& t, const vboold4& f) { // per lane: t where mask is set, otherwise f
+    return _mm256_blendv_pd(f, t, mask); // the blend is steered by the sign bit of each mask lane
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if !defined(__aarch64__)
+  __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); } // yields a0, b0, a2, b2
+  __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); } // yields a1, b1, a3, b3
+#endif
+
+#if defined(__AVX2__)
+  template<int i0, int i1, int i2, int i3> // result lane k is source lane ik
+  __forceinline vboold4 shuffle(const vboold4& v) {
+    return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i> // copies source lane i into all four result lanes
+  __forceinline vboold4 shuffle(const vboold4& v) {
+    return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i, i, i, i));
+  }
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool reduce_and(const vboold4& a) { return (unsigned int)_mm256_movemask_pd(a) == 0xfu; } // all four sign bits set
+  __forceinline bool reduce_or (const vboold4& a) { return _mm256_testz_pd(a,a) == 0; }                   // at least one sign bit set
+
+  __forceinline bool all (const vboold4& a) { return reduce_and(a); }
+  __forceinline bool any (const vboold4& a) { return reduce_or(a); }
+  __forceinline bool none(const vboold4& a) { return !reduce_or(a); }
+
+  __forceinline bool all (const vboold4& valid, const vboold4& b) { return all(b | (!valid)); } // lanes disabled in valid count as true
+  __forceinline bool any (const vboold4& valid, const vboold4& b) { return any(b & valid); }    // only lanes enabled in valid count
+  __forceinline bool none(const vboold4& valid, const vboold4& b) { return none(b & valid); }   // only lanes enabled in valid count
+
+  __forceinline unsigned int movemask(const vboold4& a) { return (unsigned int)_mm256_movemask_pd(a); } // lane k in bit k
+  __forceinline size_t       popcnt  (const vboold4& a) { return popcnt((size_t)movemask(a)); }         // forwards the lane bits to the scalar popcnt
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboold4& a, size_t index) { return a[index]; } // reads one lane as a bool
+  __forceinline void set  (vboold4& a, size_t index)     { a[index] = -1; }   // writes all 64 bits of the lane to one
+  __forceinline void clear(vboold4& a, size_t index)     { a[index] =  0; }   // writes all 64 bits of the lane to zero
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) {
+    /* vboold4 has exactly 4 lanes; a[4]..a[7] would violate the index < 4 assertion of operator[] */
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vboold4_avx512.h
+++ b/thirdparty/embree/common/simd/vboold4_avx512.h
@@ -0,0 +1,156 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide AVX-512 bool type: every lane is one bit of a __mmask8, the true constant being 0xf */
+  template<>
+  struct vboold<4>
+  {
+    using Bool = vboold4;
+    using Int  = vint4;
+
+    enum { size = 4 }; // lane count
+    __mmask8 v;        // lane bits
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold() {}
+    __forceinline vboold(const vboold4& t) : v(t.v) {}
+    __forceinline vboold4& operator =(const vboold4& f) { v = f.v; return *this; }
+
+    __forceinline vboold(const __mmask8 &t) : v(t) {}
+    __forceinline operator __mmask8() const { return v; }
+
+    __forceinline vboold(bool b) : v(b ? 0xf : 0x0) {}       // same value in all four lanes
+    __forceinline vboold(int t)  : v((__mmask8)t) {}         // raw lane bits, truncated to the mask width
+    __forceinline vboold(unsigned int t) : v((__mmask8)t) {} // raw lane bits, truncated to the mask width
+
+    /* lane bits widened to 8-bit elements */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* lane bits widened to 32-bit elements */
+    __forceinline __m128i mask32() const {
+      return _mm_movm_epi32(v);
+    }
+
+    /* lane bits widened to 64-bit elements */
+    __forceinline __m256i mask64() const {
+      return _mm256_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold(FalseTy) : v(0x0) {}
+    __forceinline vboold(TrueTy)  : v(0xf) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const {
+      assert(index < 4); return ((mm512_mask2int(v) >> index) & 1) != 0;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a, 0xf); } // ~a & 0xf: inverts the four lane bits only
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a, b); } // lane-wise AND
+  __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a, b); }  // lane-wise OR
+  __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } // lane-wise XOR
+
+  __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b, a); } // a & ~b: kandn negates its first operand
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { a = a & b; return a; } // in-place AND
+  __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { a = a | b; return a; } // in-place OR
+  __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { a = a ^ b; return a; } // in-place XOR
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } // set where the lanes differ
+  __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } // XNOR, then restricted to the four lane bits
+
+  __forceinline vboold4 select(const vboold4& s, const vboold4& a, const vboold4& b) { // per lane: a where s is set, otherwise b
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); // (s & a) | (~s & b)
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int all (const vboold4& a) { return a.v == 0xf; }                 // exactly the four lane bits set
+  __forceinline int any (const vboold4& a) { return _mm512_kortestz(a, a) == 0; } // kortestz yields 1 only when the OR of its operands is zero
+  __forceinline int none(const vboold4& a) { return _mm512_kortestz(a, a) != 0; } // no mask bit set
+
+  __forceinline int all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } // lanes disabled in valid count as true
+  __forceinline int any (const vboold4& valid, const vboold4& b) { return any(valid & b); }    // only lanes enabled in valid count
+  __forceinline int none(const vboold4& valid, const vboold4& b) { return none(valid & b); }   // only lanes enabled in valid count
+
+  __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a); } // mask bits as an integer
+  __forceinline size_t popcnt  (const vboold4& a) { return popcnt(a.v); }    // forwards the raw mask to the scalar popcnt
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a); } // mask bits as an unsigned integer
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboold4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } // tests mask bit index
+  __forceinline void set(vboold4& a, size_t index)       { assert(index < 4); a |= 1 << index; }                // ORs mask bit index in
+  __forceinline void clear(vboold4& a, size_t index)     { assert(index < 4); a = andn(a, 1 << index); }        // a & ~(1 << index)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a)
+  {
+    cout << "<"; // lanes are written as a bit string, lane 0 first
+    for (size_t lane=0; lane<4; lane++)
+      cout << (((a.v >> lane) & 1) ? "1" : "0");
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vboold8_avx512.h
+++ b/thirdparty/embree/common/simd/vboold8_avx512.h
@@ -0,0 +1,151 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX-512 bool type: every lane is one bit of a __mmask8, the true constant being 0xff */
+  template<>
+  struct vboold<8>
+  {
+    using Bool = vboold8;
+    using Int  = vint8;
+
+    enum { size = 8 }; // lane count
+    __mmask8 v;        // lane bits
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vboold() {}
+    __forceinline vboold(const vboold8& t) : v(t.v) {}
+    __forceinline vboold8& operator =(const vboold8& f) { v = f.v; return *this; }
+
+    __forceinline vboold(const __mmask8& t) : v(t) {}
+    __forceinline operator __mmask8() const { return v; }
+    
+    __forceinline vboold(bool b) : v(b ? 0xff : 0x00) {}     // same value in all eight lanes
+    __forceinline vboold(int t)  : v((__mmask8)t) {}         // raw lane bits, truncated to the mask width
+    __forceinline vboold(unsigned int t) : v((__mmask8)t) {} // raw lane bits, truncated to the mask width
+
+    /* lane bits widened to 8-bit elements */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* lane bits widened to 64-bit elements */
+    __forceinline __m512i mask64() const { 
+      return _mm512_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold(FalseTy) : v(0x00) {}
+    __forceinline vboold(TrueTy)  : v(0xff) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const {
+      assert(index < 8); return ((mm512_mask2int(v) >> index) & 1) != 0;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a); } // inverts the mask bits
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a, b); } // lane-wise AND
+  __forceinline vboold8 operator |(const vboold8& a, const vboold8& b) { return _mm512_kor(a, b); }  // lane-wise OR
+  __forceinline vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } // lane-wise XOR
+
+  __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b, a); } // a & ~b: kandn negates its first operand
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8& operator &=(vboold8& a, const vboold8& b) { a = a & b; return a; } // in-place AND
+  __forceinline vboold8& operator |=(vboold8& a, const vboold8& b) { a = a | b; return a; } // in-place OR
+  __forceinline vboold8& operator ^=(vboold8& a, const vboold8& b) { a = a ^ b; return a; } // in-place XOR
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); }  // set where the lanes differ
+  __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a, b); } // XNOR: set where the lanes agree
+  
+  __forceinline vboold8 select(const vboold8& s, const vboold8& a, const vboold8& b) { // per lane: a where s is set, otherwise b
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); // (s & a) | (~s & b)
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline int all (const vboold8& a) { return a.v == 0xff; }                // all eight lane bits set
+  __forceinline int any (const vboold8& a) { return _mm512_kortestz(a, a) == 0; } // kortestz yields 1 only when the OR of its operands is zero
+  __forceinline int none(const vboold8& a) { return _mm512_kortestz(a, a) != 0; } // no mask bit set
+
+  __forceinline int all (const vboold8& valid, const vboold8& b) { return all((!valid) | b); } // lanes disabled in valid count as true
+  __forceinline int any (const vboold8& valid, const vboold8& b) { return any(valid & b); }    // only lanes enabled in valid count
+  __forceinline int none(const vboold8& valid, const vboold8& b) { return none(valid & b); }   // only lanes enabled in valid count
+  
+  __forceinline size_t movemask(const vboold8& a) { return _mm512_kmov(a); } // mask bits as an integer
+  __forceinline size_t popcnt  (const vboold8& a) { return popcnt(a.v); }    // forwards the raw mask to the scalar popcnt
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a); } // mask bits as an unsigned integer
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboold8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } // tests mask bit index
+  __forceinline void set(vboold8& a, size_t index)       { assert(index < 8); a |= 1 << index; }                // ORs mask bit index in
+  __forceinline void clear(vboold8& a, size_t index)     { assert(index < 8); a = andn(a, 1 << index); }        // a & ~(1 << index)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboold8& a)
+  {
+    cout << "<"; // lanes are written as a bit string, lane 0 first
+    for (size_t lane=0; lane<8; lane++)
+      cout << (((a.v >> lane) & 1) ? "1" : "0");
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vboolf16_avx512.h
+++ b/thirdparty/embree/common/simd/vboolf16_avx512.h
@@ -0,0 +1,153 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 16-wide AVX-512 bool type: every lane is one bit of a __mmask16, the true constant being 0xffff */
+  template<>
+  struct vboolf<16>
+  {
+    using Bool  = vboolf16;
+    using Int   = vint16;
+    using Float = vfloat16;
+
+    enum { size = 16 }; // lane count
+    __mmask16 v;        // lane bits
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vboolf() {}
+    __forceinline vboolf(const vboolf16& t) : v(t.v) {}
+    __forceinline vboolf16& operator =(const vboolf16& f) { v = f.v; return *this; }
+
+    __forceinline vboolf(const __mmask16& t) : v(t) {}
+    __forceinline operator __mmask16() const { return v; }
+    
+    __forceinline vboolf(bool b) : v(b ? 0xFFFF : 0x0000) {}  // same value in all sixteen lanes
+    __forceinline vboolf(int t) : v((__mmask16)t) {}          // raw lane bits, truncated to the mask width
+    __forceinline vboolf(unsigned int t) : v((__mmask16)t) {} // raw lane bits, truncated to the mask width
+
+    /* lane bits widened to 8-bit elements */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* lane bits widened to 32-bit elements */
+    __forceinline __m512i mask32() const {
+      return _mm512_movm_epi32(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(0x0000) {}
+    __forceinline vboolf(TrueTy)  : v(0xffff) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+  
+    __forceinline bool operator [](size_t index) const {
+      assert(index < 16); return ((mm512_mask2int(v) >> index) & 1) != 0;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a); } // inverts all sixteen mask bits
+  
+   ////////////////////////////////////////////////////////////////////////////////
+   /// Binary Operators
+   ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a,b); } // lane-wise AND
+  __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a,b); }  // lane-wise OR
+  __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a,b); } // lane-wise XOR
+
+  __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b,a); } // a & ~b: kandn negates its first operand
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16& operator &=(vboolf16& a, const vboolf16& b) { a = a & b; return a; } // in-place AND
+  __forceinline vboolf16& operator |=(vboolf16& a, const vboolf16& b) { a = a | b; return a; } // in-place OR
+  __forceinline vboolf16& operator ^=(vboolf16& a, const vboolf16& b) { a = a ^ b; return a; } // in-place XOR
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a, b); }  // set where the lanes differ
+  __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a, b); } // XNOR: set where the lanes agree
+  
+  __forceinline vboolf16 select(const vboolf16& s, const vboolf16& a, const vboolf16& b) { // per lane: a where s is set, otherwise b
+    return _mm512_kor(_mm512_kand(s,a),_mm512_kandn(s,b)); // (s & a) | (~s & b)
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline int all (const vboolf16& a) { return  _mm512_kortestc(a,a) != 0; } // kortestc yields 1 only when the OR of its operands is all ones
+  __forceinline int any (const vboolf16& a) { return  _mm512_kortestz(a,a) == 0; } // kortestz yields 1 only when the OR of its operands is zero
+  __forceinline int none(const vboolf16& a) { return  _mm512_kortestz(a,a) != 0; } // no mask bit set
+
+  __forceinline int all (const vboolf16& valid, const vboolf16& b) { return all((!valid) | b); } // lanes disabled in valid count as true
+  __forceinline int any (const vboolf16& valid, const vboolf16& b) { return any(valid & b); }    // only lanes enabled in valid count
+  __forceinline int none(const vboolf16& valid, const vboolf16& b) { return none(valid & b); }   // only lanes enabled in valid count
+  
+  __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a); } // mask bits as an integer
+  __forceinline size_t popcnt  (const vboolf16& a) { return popcnt(a.v); }    // forwards the raw mask to the scalar popcnt
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); } // mask bits as an unsigned integer
+  __forceinline vboolf16     toMask(const int& a)      { return mm512_int2mask(a); } // inverse direction: integer bits to a mask
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf16& a, size_t index) { assert(index < 16); return (toInt(a) >> index) & 1; } // tests mask bit index
+  __forceinline void set(vboolf16& a, size_t index)       { assert(index < 16); a |= 1 << index; }                // ORs mask bit index in
+  __forceinline void clear(vboolf16& a, size_t index)     { assert(index < 16); a = andn(a, 1 << index); }        // a & ~(1 << index)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf16& a)
+  {
+    cout << "<"; // lanes are written as a bit string, lane 0 first
+    for (size_t lane=0; lane<16; lane++)
+      cout << (((a.v >> lane) & 1) ? "1" : "0");
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vboolf4_avx512.h
+++ b/thirdparty/embree/common/simd/vboolf4_avx512.h
@@ -0,0 +1,159 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide AVX-512 bool type: every lane is one bit of a __mmask8, the true constant being 0xf */
+  template<>
+  struct vboolf<4>
+  {
+    using Bool = vboolf4;
+    using Int  = vint4;
+
+    enum { size = 4 }; // lane count
+    __mmask8 v;        // lane bits
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf() {}
+    __forceinline vboolf(const vboolf4& t) : v(t.v) {}
+    __forceinline vboolf4& operator =(const vboolf4& f) { v = f.v; return *this; }
+
+    __forceinline vboolf(const __mmask8 &t) : v(t) {}
+    __forceinline operator __mmask8() const { return v; }
+
+    __forceinline vboolf(bool b) : v(b ? 0xf : 0x0) {}       // same value in all four lanes
+    __forceinline vboolf(int t)  : v((__mmask8)t) {}         // raw lane bits, truncated to the mask width
+    __forceinline vboolf(unsigned int t) : v((__mmask8)t) {} // raw lane bits, truncated to the mask width
+
+    __forceinline vboolf(bool a, bool b, bool c, bool d) // a is lane 0 (bit 0) through d as lane 3 (bit 3)
+      : v((__mmask8)(int(a) | (int(b) << 1) | (int(c) << 2) | (int(d) << 3))) {}
+
+    /* lane bits widened to 8-bit elements */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* lane bits widened to 32-bit elements */
+    __forceinline __m128i mask32() const {
+      return _mm_movm_epi32(v);
+    }
+
+    /* lane bits widened to 64-bit elements */
+    __forceinline __m256i mask64() const {
+      return _mm256_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(0x0) {}
+    __forceinline vboolf(TrueTy)  : v(0xf) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const {
+      assert(index < 4); return ((mm512_mask2int(v) >> index) & 1) != 0;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a, 0xf); } // ~a & 0xf: inverts the four lane bits only
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a, b); } // lane-wise AND
+  __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a, b); }  // lane-wise OR
+  __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } // lane-wise XOR
+
+  __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm512_kandn(b, a); } // a & ~b: kandn negates its first operand
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { a = a & b; return a; } // in-place AND
+  __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { a = a | b; return a; } // in-place OR
+  __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { a = a ^ b; return a; } // in-place XOR
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } // set where the lanes differ
+  __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } // XNOR, then restricted to the four lane bits
+
+  __forceinline vboolf4 select(const vboolf4& s, const vboolf4& a, const vboolf4& b) { // per lane: a where s is set, otherwise b
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); // (s & a) | (~s & b)
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int all (const vboolf4& a) { return a.v == 0xf; }                 // exactly the four lane bits set
+  __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a, a) == 0; } // kortestz yields 1 only when the OR of its operands is zero
+  __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a, a) != 0; } // no mask bit set
+
+  __forceinline int all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } // lanes disabled in valid count as true
+  __forceinline int any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); }    // only lanes enabled in valid count
+  __forceinline int none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); }   // only lanes enabled in valid count
+
+  __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a); } // mask bits as an integer
+  __forceinline size_t popcnt  (const vboolf4& a) { return popcnt(a.v); }    // forwards the raw mask to the scalar popcnt
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a); } // mask bits as an unsigned integer
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } // tests mask bit index
+  __forceinline void set(vboolf4& a, size_t index)       { assert(index < 4); a |= 1 << index; }                // ORs mask bit index in
+  __forceinline void clear(vboolf4& a, size_t index)     { assert(index < 4); a = andn(a, 1 << index); }        // a & ~(1 << index)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a)
+  {
+    cout << "<"; // lanes are written as a bit string, lane 0 first
+    for (size_t lane=0; lane<4; lane++)
+      cout << (((a.v >> lane) & 1) ? "1" : "0");
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vboolf4_sse2.h
+++ b/thirdparty/embree/common/simd/vboolf4_sse2.h
@@ -0,0 +1,205 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide SSE bool type: one boolean per 32-bit lane of a 128-bit vector */
+  template<>
+  struct vboolf<4>
+  {
+    ALIGNED_STRUCT_(16);
+    
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 };            // number of SIMD elements
+    union { __m128 v; int i[4]; }; // data: the same 16 bytes seen as one vector or as four int lanes
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vboolf() {} // v is left uninitialized
+    __forceinline vboolf(const vboolf4& other) { v = other.v; }
+    __forceinline vboolf4& operator =(const vboolf4& other) { v = other.v; return *this; }
+
+    __forceinline vboolf(__m128 input) : v(input) {} // adopt a raw vector unchanged
+    __forceinline operator const __m128&() const { return v; }
+    #if !defined(__EMSCRIPTEN__)
+    __forceinline operator const __m128i() const { return _mm_castps_si128(v); } // same bits, integer vector type
+    __forceinline operator const __m128d() const { return _mm_castps_pd(v); }    // same bits, double vector type
+    #endif
+
+    __forceinline vboolf(bool a) // index 0 or 15 into mm_lookupmask_ps (table defined elsewhere; presumably index bit k drives lane k - confirm)
+      : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
+    __forceinline vboolf(bool a, bool b) // index bits 0..3 are a,b,a,b
+      : v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+    __forceinline vboolf(bool a, bool b, bool c, bool d) // index bits 0..3 are a,b,c,d
+      : v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+    __forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; } // mask is used directly as the table index
+    __forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; }     // unsigned variant of the above
+
+    /* return int32 mask: the stored bits reinterpreted as an integer vector */
+    __forceinline __m128i mask32() const { 
+      return _mm_castps_si128(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(_mm_setzero_ps()) {} // every bit cleared
+    __forceinline vboolf(TrueTy)  : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} // comparing zero with zero for equality sets every bit
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; } // read-only: bit `index` of the movemask of v
+    __forceinline int& operator [](size_t index)       { assert(index < 4); return i[index]; }                          // writable: the lane's raw 32-bit int
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 operator !(const vboolf4& a) { const vboolf4 allSet(embree::True); return _mm_xor_ps(a, allSet); } // XOR with the all-true mask flips every bit
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { const __m128 r = _mm_and_ps(a, b); return r; }
+  __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { const __m128 r = _mm_or_ps (a, b); return r; }
+  __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { const __m128 r = _mm_xor_ps(a, b); return r; }
+
+  __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { const __m128 r = _mm_andnot_ps(b, a); return r; } // a & ~b: b is passed first because the intrinsic negates its first operand
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { a = a & b; return a; } // in-place AND, yields the updated left operand
+  __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { a = a | b; return a; } // in-place OR
+  __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { a = a ^ b; return a; } // in-place XOR
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { const __m128 differ = _mm_xor_ps(a, b); return differ; }
+  __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { const __m128i same = _mm_cmpeq_epi32(a, b); return _mm_castsi128_ps(same); }
+  
+  __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) {
+#if !defined(__aarch64__) && !defined(__SSE4_1__)
+    return _mm_or_ps(_mm_andnot_ps(m, f), _mm_and_ps(m, t)); // (~m & f) | (m & t)
+#else
+    return _mm_blendv_ps(f, t, m); // per lane: t where m is set, otherwise f
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { const __m128 lo = _mm_unpacklo_ps(a, b); return lo; } // interleave the low halves: a0,b0,a1,b1
+  __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { const __m128 hi = _mm_unpackhi_ps(a, b); return hi; } // interleave the high halves: a2,b2,a3,b3
+
+#if defined(__aarch64__)
+  template<int i0, int i1, int i2, int i3> // NEON path: byte table lookup on v; _MN_SHUFFLE is defined elsewhere and presumably builds the byte index vector (confirm)
+  __forceinline vboolf4 shuffle(const vboolf4& v) {
+    return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32((int32x4_t)v.v), _MN_SHUFFLE(i0, i1, i2, i3)));
+  }
+
+  template<int i0, int i1, int i2, int i3> // NEON path: lookup over the 32 bytes of a followed by b; _MF_SHUFFLE is defined elsewhere
+  __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
+    return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+  }
+#else
+  template<int i0, int i1, int i2, int i3> // result lanes 0..3 are v[i0], v[i1], v[i2], v[i3]
+  __forceinline vboolf4 shuffle(const vboolf4& v) {
+    return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0))); // _MM_SHUFFLE takes the highest lane first, hence the reversed argument order
+  }
+
+  template<int i0, int i1, int i2, int i3> // result lanes 0..3 are a[i0], a[i1], b[i2], b[i3]
+  __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
+    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+#endif
+
+  template<int i0> // broadcast: every result lane takes index i0
+  __forceinline vboolf4 shuffle(const vboolf4& v) {
+    const vboolf4 splat = shuffle<i0, i0, i0, i0>(v); return splat;
+  }
+
+#if defined(__SSE3__)
+  template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { const __m128 evens = _mm_moveldup_ps(v); return evens; }                    // dedicated instruction duplicating lanes 0 and 2
+  template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { const __m128 odds = _mm_movehdup_ps(v); return odds; }                      // dedicated instruction duplicating lanes 1 and 3
+  template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { const __m128d lowPair = _mm_movedup_pd(v); return _mm_castpd_ps(lowPair); } // repeats the low 64 bits
+#endif
+
+#if defined(__SSE4_1__) && !defined(__aarch64__)
+  template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (src << 6) | (dst << 4) | clr); } // imm8 = source lane of b | destination lane in a | zero mask
+  template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { const vboolf4 merged = insert<dst, src, 0>(a, b); return merged; }    // same, with no lanes zeroed
+  template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { const vboolf4 splat(b); return insert<dst, 0>(a, splat); }                          // b is broadcast first, then lane 0 of it is inserted
+#endif
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+    
+  __forceinline bool reduce_and(const vboolf4& a) { const int bits = _mm_movemask_ps(a); return bits == 0xf; } // all four lanes set
+  __forceinline bool reduce_or (const vboolf4& a) { const int bits = _mm_movemask_ps(a); return bits != 0x0; } // at least one lane set
+
+  __forceinline bool all (const vboolf4& b) { const int bits = _mm_movemask_ps(b); return bits == 0xf; }
+  __forceinline bool any (const vboolf4& b) { const int bits = _mm_movemask_ps(b); return bits != 0x0; }
+  __forceinline bool none(const vboolf4& b) { const int bits = _mm_movemask_ps(b); return bits == 0x0; }
+
+  __forceinline bool all (const vboolf4& valid, const vboolf4& b) { const vboolf4 pass = (!valid) | b; return all(pass); }  // b set wherever valid is set
+  __forceinline bool any (const vboolf4& valid, const vboolf4& b) { const vboolf4 hits = valid & b; return any(hits); }     // b set in some valid lane
+  __forceinline bool none(const vboolf4& valid, const vboolf4& b) { const vboolf4 hits = valid & b; return none(hits); }    // b set in no valid lane
+  
+  __forceinline size_t movemask(const vboolf4& a) { const int bits = _mm_movemask_ps(a); return bits; } // one bit per lane, lane 0 in bit 0
+#if defined(__aarch64__)
+  __forceinline size_t popcnt(const vboolf4& a) { return vaddvq_s32(vandq_u32(vreinterpretq_u32_f32(a.v),_mm_set1_epi32(1))); } // keep bit 0 of every lane, then add the four lanes
+#elif defined(__SSE4_2__)
+  __forceinline size_t popcnt(const vboolf4& a) { return popcnt(movemask(a)); } // count the bits of the 4-bit lane mask
+#else
+  __forceinline size_t popcnt(const vboolf4& a) { size_t lanesSet = 0; for (size_t k = 0; k < 4; k++) lanesSet += a[k]; return lanesSet; } // portable fallback: test each lane
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf4& a, size_t index) { const bool lane = a[index]; return lane; }
+  __forceinline void set(vboolf4& a, size_t index)       { int& lane = a[index]; lane = -1; } // all bits of the lane on
+  __forceinline void clear(vboolf4& a, size_t index)     { int& lane = a[index]; lane =  0; } // all bits of the lane off
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) {
+    cout << "<" << a[0]; for (size_t lane = 1; lane < 4; lane++) cout << ", " << a[lane]; return cout << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vboolf8_avx.h
+++ b/thirdparty/embree/common/simd/vboolf8_avx.h
@@ -0,0 +1,202 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX bool type: one boolean per 32-bit lane of a 256-bit vector */
+  template<>
+  struct vboolf<8>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };       // number of SIMD elements
+    union {                   // data: three views of the same 32 bytes
+      __m256 v;               // the whole vector
+      struct { __m128 vl,vh; }; // low and high 128-bit halves
+      int i[8];               // the eight lanes as raw ints
+    };  
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf() {} // v is left uninitialized
+    __forceinline vboolf(const vboolf8& a) { v = a.v; }
+    __forceinline vboolf8& operator =(const vboolf8& a) { v = a.v; return *this; }
+
+    __forceinline vboolf(__m256 a) : v(a) {} // adopt a raw vector unchanged
+    __forceinline operator const __m256&() const { return v; }
+    __forceinline operator const __m256i() const { return _mm256_castps_si256(v); } // same bits, integer vector type
+    __forceinline operator const __m256d() const { return _mm256_castps_pd(v); }    // same bits, double vector type
+
+    __forceinline vboolf(int a) // build from an 8-bit pattern: bit k of a decides lane k
+    {
+      assert(a >= 0 && a <= 255);
+#if defined (__AVX2__)
+      const __m256i mask = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1); // lane k holds 1 << k (arguments are listed highest lane first)
+      const __m256i b = _mm256_set1_epi32(a);
+      const __m256i c = _mm256_and_si256(b,mask);           // isolate bit k of a in lane k
+      v = _mm256_castsi256_ps(_mm256_cmpeq_epi32(c,mask));  // lane becomes all-ones exactly when its bit was set
+#else
+      vl = mm_lookupmask_ps[a & 0xF]; // low nibble -> low half (table defined elsewhere)
+      vh = mm_lookupmask_ps[a >> 4];  // high nibble -> high half
+#endif
+    }
+
+    __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}                   // both halves = a
+    __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} // low half = a, high half = b
+    __forceinline vboolf(__m128 a, __m128 b) : vl(a), vh(b) {}                                                           // low half = a, high half = b
+
+    __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a), vboolf4(a))) {}                 // all eight lanes = a
+    __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a), vboolf4(b))) {}         // low four lanes = a, high four lanes = b
+    __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b), vboolf4(c,d))) {} // each half is built by the two-bool vboolf4 constructor
+    __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d), vboolf4(e,f,g,h))) {} // a..d -> low half, e..h -> high half
+
+    /* return int32 mask: the stored bits reinterpreted as an integer vector */
+    __forceinline __m256i mask32() const { 
+      return _mm256_castps_si256(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {}                                  // every bit cleared
+    __forceinline vboolf(TrueTy)  : v(_mm256_castsi256_ps(_mm256_set1_epi32(0xFFFFFFFF))) {}   // every bit set
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const { assert(index < 8); return (_mm256_movemask_ps(v) >> index) & 1; } // read-only: bit `index` of the movemask of v
+    __forceinline int& operator [](size_t index)       { assert(index < 8); return i[index]; }                             // writable: the lane's raw 32-bit int
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !(const vboolf8& a) { const vboolf8 allSet(embree::True); return _mm256_xor_ps(a, allSet); } // XOR with the all-true mask flips every bit
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { const __m256 r = _mm256_and_ps(a, b); return r; }
+  __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { const __m256 r = _mm256_or_ps (a, b); return r; }
+  __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { const __m256 r = _mm256_xor_ps(a, b); return r; }
+
+  __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { const __m256 r = _mm256_andnot_ps(b, a); return r; } // a & ~b: b is passed first because the intrinsic negates its first operand
+
+  __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { a = a & b; return a; } // in-place AND, yields the updated left operand
+  __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { a = a | b; return a; } // in-place OR
+  __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { a = a ^ b; return a; } // in-place XOR
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { const __m256 differ = _mm256_xor_ps(a, b); return differ; }
+  __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { const __m256 differ = _mm256_xor_ps(a, b); return _mm256_xor_ps(differ, vboolf8(embree::True)); } // NOT (a XOR b)
+
+  __forceinline vboolf8 select(const vboolf8& mask, const vboolf8& t, const vboolf8& f) {
+    const __m256 picked = _mm256_blendv_ps(f, t, mask); return picked; // per lane: t where mask is set, otherwise f
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { const __m256 lo = _mm256_unpacklo_ps(a, b); return lo; } // interleave a and b, low pairs of each 128-bit half
+  __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { const __m256 hi = _mm256_unpackhi_ps(a, b); return hi; } // interleave a and b, high pairs of each 128-bit half
+
+  template<int i> // broadcast element i within each 128-bit half of v
+  __forceinline vboolf8 shuffle(const vboolf8& v) {
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+  }
+
+  template<int i0, int i1> // rearrange the 128-bit halves of v: i0 picks the low result half, i1 the high one
+  __forceinline vboolf8 shuffle4(const vboolf8& v) {
+    return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); // control byte: low nibble = i0, high nibble = i1
+  }
+
+  template<int i0, int i1> // same control byte, but halves are drawn from the pair (a, b); exact selector values follow the intrinsic's imm8 encoding
+  __forceinline vboolf8 shuffle4(const vboolf8& a, const vboolf8& b) {
+    return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3> // permute the four elements of each 128-bit half with the same pattern
+  __forceinline vboolf8 shuffle(const vboolf8& v) {
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); // _MM_SHUFFLE takes the highest element first, hence the reversed argument order
+  }
+
+  template<int i0, int i1, int i2, int i3> // per 128-bit half: elements i0,i1 come from a, elements i2,i3 from b
+  __forceinline vboolf8 shuffle(const vboolf8& a, const vboolf8& b) {
+    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { const __m256 evens = _mm256_moveldup_ps(v); return evens; } // dedicated instruction duplicating even elements
+  template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { const __m256 odds = _mm256_movehdup_ps(v); return odds; }   // dedicated instruction duplicating odd elements
+  template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { const __m256d asPairs = _mm256_castps_pd(v); return _mm256_castpd_ps(_mm256_movedup_pd(asPairs)); } // repeats the low 64 bits of each half
+
+  template<int i> __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { const __m256 merged = _mm256_insertf128_ps(a, b, i); return merged; } // a with 128-bit half i replaced by b
+  template<int i> __forceinline vboolf4 extract4   (const vboolf8& a) { const __m128 half = _mm256_extractf128_ps(a, i); return half; }                    // 128-bit half i of a
+  template<>      __forceinline vboolf4 extract4<0>(const vboolf8& a) { const __m128 low = _mm256_castps256_ps128(a); return low; }                        // the low half is obtained with a plain cast
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool reduce_and(const vboolf8& a) { const int bits = _mm256_movemask_ps(a); return bits == (unsigned int)0xff; } // all eight lanes set
+  __forceinline bool reduce_or (const vboolf8& a) { const int noneSet = _mm256_testz_ps(a,a); return noneSet == 0; }             // at least one lane set
+
+  __forceinline bool all (const vboolf8& a) { const int bits = _mm256_movemask_ps(a); return bits == (unsigned int)0xff; }
+  __forceinline bool any (const vboolf8& a) { const int noneSet = _mm256_testz_ps(a,a); return noneSet == 0; }
+  __forceinline bool none(const vboolf8& a) { const int noneSet = _mm256_testz_ps(a,a); return noneSet != 0; }
+
+  __forceinline bool all (const vboolf8& valid, const vboolf8& b) { const vboolf8 pass = (!valid) | b; return all(pass); }  // b set wherever valid is set
+  __forceinline bool any (const vboolf8& valid, const vboolf8& b) { const vboolf8 hits = valid & b; return any(hits); }     // b set in some valid lane
+  __forceinline bool none(const vboolf8& valid, const vboolf8& b) { const vboolf8 hits = valid & b; return none(hits); }    // b set in no valid lane
+
+  __forceinline unsigned int movemask(const vboolf8& a) { const int bits = _mm256_movemask_ps(a); return bits; } // one bit per lane, lane 0 in bit 0
+  __forceinline size_t       popcnt  (const vboolf8& a) { return popcnt((size_t)movemask(a)); }                  // count the bits of the 8-bit lane mask
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf8& a, size_t index) { const bool lane = a[index]; return lane; }
+  __forceinline void set(vboolf8& a, size_t index)       { int& lane = a[index]; lane = -1; } // all bits of the lane on
+  __forceinline void clear(vboolf8& a, size_t index)     { int& lane = a[index]; lane =  0; } // all bits of the lane off
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) {
+    cout << "<" << a[0]; for (size_t lane = 1; lane < 8; lane++) cout << ", " << a[lane]; // lanes 1..7 are each preceded by a separator
+    return cout << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vboolf8_avx512.h
+++ b/thirdparty/embree/common/simd/vboolf8_avx512.h
@@ -0,0 +1,159 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX-512 bool type: one bit of a mask register value per element */
+  template<>
+  struct vboolf<8>
+  {
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+
+    enum { size = 8 }; // number of SIMD elements
+    __mmask8 v;        // data: bit k holds element k (see the eight-bool constructor)
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf() {} // v is left uninitialized
+    __forceinline vboolf(const vboolf8& t) { v = t.v; }
+    __forceinline vboolf8& operator =(const vboolf8& f) { v = f.v; return *this; }
+
+    __forceinline vboolf(const __mmask8 &t) { v = t; } // adopt a raw mask unchanged
+    __forceinline operator __mmask8() const { return v; }
+
+    __forceinline vboolf(bool b) { v = b ? 0xff : 0x00; } // all eight bits on or all off
+    __forceinline vboolf(int t)  { v = (__mmask8)t; }      // bits of t narrowed to the mask type; no range check
+    __forceinline vboolf(unsigned int t) { v = (__mmask8)t; }
+
+    __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) // a -> bit 0 ... h -> bit 7
+      : v((__mmask8)((int(h) << 7) | (int(g) << 6) | (int(f) << 5) | (int(e) << 4) | (int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {}
+
+    /* return int8 mask: the mask expanded to a vector of 8-bit elements */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* return int32 mask: the mask expanded to a vector of 32-bit elements */
+    __forceinline __m256i mask32() const {
+      return _mm256_movm_epi32(v);
+    }
+
+    /* return int64 mask: the mask expanded to a vector of 64-bit elements */
+    __forceinline __m512i mask64() const {
+      return _mm512_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(0x00) {} // every element false
+    __forceinline vboolf(TrueTy)  : v(0xff) {} // every element true
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const { // read-only access; this type has no writable operator[]
+      assert(index < 8); return (mm512_mask2int(v) >> index) & 1; // NOTE(review): mm512_mask2int (no leading underscore) is not declared in this header - confirm where it comes from
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !(const vboolf8& a) { const __mmask16 inverted = _mm512_knot(a); return inverted; } // only the low 8 bits survive the conversion back to vboolf8
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { const __mmask16 r = _mm512_kand(a, b); return r; }
+  __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { const __mmask16 r = _mm512_kor(a, b); return r; }
+  __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { const __mmask16 r = _mm512_kxor(a, b); return r; }
+
+  __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { const __mmask16 r = _mm512_kandn(b, a); return r; } // a & ~b: b is passed first because the intrinsic negates its first operand
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { a = a & b; return a; } // in-place AND, yields the updated left operand
+  __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { a = a | b; return a; } // in-place OR
+  __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { a = a ^ b; return a; } // in-place XOR
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { const __mmask16 differ = _mm512_kxor(a, b); return differ; }
+  __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { const __mmask16 same = _mm512_kxnor(a, b); return same; }
+
+  __forceinline vboolf8 select(const vboolf8& s, const vboolf8& a, const vboolf8& b) {
+    const __mmask16 fromA = _mm512_kand(s, a), fromB = _mm512_kandn(s, b); return _mm512_kor(fromA, fromB); // (s & a) | (~s & b)
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int all (const vboolf8& a) { const __mmask8 bits = a.v; return bits == 0xff; }                 // all eight bits set
+  __forceinline int any (const vboolf8& a) { const int isEmpty = _mm512_kortestz(a, a); return isEmpty == 0; } // some bit set
+  __forceinline int none(const vboolf8& a) { const int isEmpty = _mm512_kortestz(a, a); return isEmpty != 0; } // no bit set
+
+  __forceinline int all (const vboolf8& valid, const vboolf8& b) { const vboolf8 pass = (!valid) | b; return all(pass); }  // b set wherever valid is set
+  __forceinline int any (const vboolf8& valid, const vboolf8& b) { const vboolf8 hits = valid & b; return any(hits); }     // b set for some valid element
+  __forceinline int none(const vboolf8& valid, const vboolf8& b) { const vboolf8 hits = valid & b; return none(hits); }    // b set for no valid element
+
+  __forceinline size_t movemask(const vboolf8& a) { const size_t bits = _mm512_kmov(a); return bits; }  // mask value widened to size_t
+  __forceinline size_t popcnt  (const vboolf8& a) { const size_t count = popcnt(a.v); return count; }   // delegates to the popcnt overload for the raw mask (defined elsewhere)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboolf8& a) { const unsigned int bits = mm512_mask2int(a); return bits; } // mask as an unsigned integer; mm512_mask2int is declared elsewhere
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf8& a, size_t index) { assert(index < 8); const unsigned int bits = toInt(a); return (bits >> index) & 1; } // read bit `index`
+  __forceinline void set(vboolf8& a, size_t index)       { assert(index < 8); const int bit = 1 << index; a |= bit; }                          // turn bit `index` on
+  __forceinline void clear(vboolf8& a, size_t index)     { assert(index < 8); const int bit = 1 << index; a = andn(a, bit); }                  // turn bit `index` off
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a)
+  {
+    cout << "<";
+    for (size_t bit = 0; bit != 8; ++bit) { // bit 0 is printed first, no separators
+      if (((a.v >> bit) & 1) == 0) cout << "0"; else cout << "1";
+    }
+    return cout << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vdouble4_avx.h
+++ b/thirdparty/embree/common/simd/vdouble4_avx.h
@@ -0,0 +1,328 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+/* Temporarily map the short type names onto their *_impl spellings, so that the
+   specialization written below as vdouble<4> is really a specialization of vdouble_impl.
+   The matching #undef lines at the end of this header remove the macros again. */
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{ 
+  /* 4-wide AVX 64-bit double type */
+  template<>
+  struct vdouble<4>
+  {
+    ALIGNED_STRUCT_(32);
+
+    typedef vboold4 Bool;
+
+    enum  { size = 4 }; // number of SIMD elements
+    union {             // data
+      __m256d v;        // the four lanes as one AVX register
+      double i[4];      // the same storage viewed lane by lane
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /* the default constructor leaves the lanes uninitialized */
+    __forceinline vdouble() {}
+    __forceinline vdouble(const vdouble4& t) : v(t.v) {}
+    __forceinline vdouble4& operator =(const vdouble4& f) { v = f.v; return *this; }
+
+    __forceinline vdouble(const __m256d& t) : v(t) {}
+    __forceinline operator __m256d() const { return v; }
+
+    /* broadcasts one scalar into all four lanes */
+    __forceinline vdouble(double i) : v(_mm256_set1_pd(i)) {}
+
+    /* a goes to lane 0 and d to lane 3; _mm256_set_pd lists the highest lane first */
+    __forceinline vdouble(double a, double b, double c, double d) : v(_mm256_set_pd(d,c,b,a)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vdouble(ZeroTy) : v(_mm256_setzero_pd()) {}
+    __forceinline vdouble(OneTy)  : v(_mm256_set1_pd(1)) {}
+    /* lanes 0..3 hold 0,1,2,3 */
+    __forceinline vdouble(StepTy) : v(_mm256_setr_pd(0.0,1.0,2.0,3.0)) {}
+    /* lanes 0..3 hold 3,2,1,0 */
+    __forceinline vdouble(ReverseStepTy) : v(_mm256_set_pd(0.0,1.0,2.0,3.0)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /* store through the streaming (non-temporal) store intrinsic */
+    static __forceinline void store_nt(double *__restrict__ ptr, const vdouble4& a) {
+      _mm256_stream_pd(ptr, a);
+    }
+
+    /* unaligned load of four consecutive doubles */
+    static __forceinline vdouble4 loadu(const double* addr) {
+      return _mm256_loadu_pd(addr);
+    }
+
+    /* aligned loads; the aligned intrinsic is used, so addr is expected to be 32-byte aligned */
+    static __forceinline vdouble4 load(const vdouble4* addr) {
+      return _mm256_load_pd(reinterpret_cast<const double*>(addr));
+    }
+
+    static __forceinline vdouble4 load(const double* addr) {
+      return _mm256_load_pd(addr);
+    }
+
+    /* aligned store */
+    static __forceinline void store(double* ptr, const vdouble4& v) {
+      _mm256_store_pd(ptr, v);
+    }
+
+    /* unaligned store */
+    static __forceinline void storeu(double* ptr, const vdouble4& v) {
+      _mm256_storeu_pd(ptr, v);
+    }
+
+    /* reads one double through the untyped pointer and replicates it into every lane */
+    static __forceinline vdouble4 broadcast(const void* a) {
+      return _mm256_set1_pd(*static_cast<const double*>(a));
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /* per-lane access through the array view of the union; index must be below 4 */
+    __forceinline       double& operator [](size_t index)       { assert(index < 4); return i[index]; }
+    __forceinline const double& operator [](size_t index) const { assert(index < 4); return i[index]; }
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Reinterpret the 256 bits as the other vector type; the cast intrinsics do not convert
+     values. These two helpers exist only when __AVX2__ is defined. */
+#if defined(__AVX2__)
+  __forceinline vdouble4 asDouble(const vllong4&  a) { return _mm256_castsi256_pd(a); }
+  __forceinline vllong4  asLLong (const vdouble4& a) { return _mm256_castpd_si256(a); }
+#endif
+
+  /* Unary plus returns its operand unchanged. Unary minus is computed as 0 - a.
+     NOTE(review): a subtraction from zero is not a pure sign flip - under the default
+     rounding mode 0.0 - (+0.0) yields +0.0 rather than -0.0; confirm no caller depends
+     on the sign of a negated zero. */
+  __forceinline vdouble4 operator +(const vdouble4& a) { return a; }
+  __forceinline vdouble4 operator -(const vdouble4& a) { return _mm256_sub_pd(_mm256_setzero_pd(), a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Lane-wise addition, subtraction and multiplication. The overloads that take a double
+     first broadcast the scalar to all four lanes through the vdouble4(double) constructor
+     and then forward to the vector/vector overload. */
+  __forceinline vdouble4 operator +(const vdouble4& a, const vdouble4& b) { return _mm256_add_pd(a, b); }
+  __forceinline vdouble4 operator +(const vdouble4& a, double          b) { return a + vdouble4(b); }
+  __forceinline vdouble4 operator +(double          a, const vdouble4& b) { return vdouble4(a) + b; }
+
+  __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { return _mm256_sub_pd(a, b); }
+  __forceinline vdouble4 operator -(const vdouble4& a, double          b) { return a - vdouble4(b); }
+  __forceinline vdouble4 operator -(double          a, const vdouble4& b) { return vdouble4(a) - b; }
+
+  __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { return _mm256_mul_pd(a, b); }
+  __forceinline vdouble4 operator *(const vdouble4& a, double          b) { return a * vdouble4(b); }
+  __forceinline vdouble4 operator *(double          a, const vdouble4& b) { return vdouble4(a) * b; }
+
+  /* Bitwise AND / OR / XOR applied to the raw bit patterns of the lanes. The scalar
+     overloads broadcast the double and combine with its bit pattern in every lane. */
+  __forceinline vdouble4 operator &(const vdouble4& a, const vdouble4& b) { return _mm256_and_pd(a, b); }
+  __forceinline vdouble4 operator &(const vdouble4& a, double          b) { return a & vdouble4(b); }
+  __forceinline vdouble4 operator &(double          a, const vdouble4& b) { return vdouble4(a) & b; }
+
+  __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { return _mm256_or_pd(a, b); }
+  __forceinline vdouble4 operator |(const vdouble4& a, double          b) { return a | vdouble4(b); }
+  __forceinline vdouble4 operator |(double          a, const vdouble4& b) { return vdouble4(a) | b; }
+
+  __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { return _mm256_xor_pd(a, b); }
+  __forceinline vdouble4 operator ^(const vdouble4& a, double          b) { return a ^ vdouble4(b); }
+  __forceinline vdouble4 operator ^(double          a, const vdouble4& b) { return vdouble4(a) ^ b; }
+  
+  /* Lane-wise minimum and maximum; scalar arguments are broadcast first.
+     NOTE(review): per the Intel documentation _mm256_min_pd/_mm256_max_pd return the second
+     operand when a lane of either input is NaN, so the result is not symmetric in (a, b)
+     for NaN inputs - confirm callers do not rely on a particular NaN policy. */
+  __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { return _mm256_min_pd(a, b); }
+  __forceinline vdouble4 min(const vdouble4& a, double          b) { return min(a,vdouble4(b)); }
+  __forceinline vdouble4 min(double          a, const vdouble4& b) { return min(vdouble4(a),b); }
+
+  __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { return _mm256_max_pd(a, b); }
+  __forceinline vdouble4 max(const vdouble4& a, double          b) { return max(a,vdouble4(b)); }
+  __forceinline vdouble4 max(double          a, const vdouble4& b) { return max(vdouble4(a),b); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Multiply-add family, lane-wise:
+       madd  =  a*b + c      msub  =  a*b - c
+       nmadd = -(a*b) + c    nmsub = -(a*b) - c */
+#if defined(__FMA__)
+  // fused multiply-add instructions are used when __FMA__ is defined
+  __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a,b,c); }
+  __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a,b,c); }
+  __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a,b,c); }
+  __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a,b,c); }
+#else
+  // fallback built from the separate multiply and add/subtract operators of this header;
+  // note that -a*b parses as (-a)*b, i.e. a is negated before the multiplication
+  __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b+c; }
+  __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b-c; }
+  __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b+c;}
+  __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b-c; }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Compound assignment: each operator evaluates the corresponding binary operator,
+     stores the result back into a and returns a reference to a. */
+  __forceinline vdouble4& operator +=(vdouble4& a, const vdouble4& b) { return a = a + b; }
+  __forceinline vdouble4& operator +=(vdouble4& a, double          b) { return a = a + b; }
+  
+  __forceinline vdouble4& operator -=(vdouble4& a, const vdouble4& b) { return a = a - b; }
+  __forceinline vdouble4& operator -=(vdouble4& a, double          b) { return a = a - b; }
+
+  __forceinline vdouble4& operator *=(vdouble4& a, const vdouble4& b) { return a = a * b; }
+  __forceinline vdouble4& operator *=(vdouble4& a, double          b) { return a = a * b; }
+  
+  __forceinline vdouble4& operator &=(vdouble4& a, const vdouble4& b) { return a = a & b; }
+  __forceinline vdouble4& operator &=(vdouble4& a, double          b) { return a = a & b; }
+  
+  __forceinline vdouble4& operator |=(vdouble4& a, const vdouble4& b) { return a = a | b; }
+  __forceinline vdouble4& operator |=(vdouble4& a, double          b) { return a = a | b; }
+  
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Lane-wise comparisons producing a vboold4 mask. Three implementations are selected
+     at compile time. */
+#if defined(__AVX512VL__)
+  // AVX-512VL: compare directly into a mask register. The predicate is passed using the
+  // _MM_CMPINT_* names. NOTE(review): those are the integer-compare predicate names;
+  // confirm their numeric values select the intended floating-point predicates.
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_EQ); }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_NE); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LT); }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); }
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); }
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); }
+#elif !defined(__aarch64__)
+  // Plain AVX: _mm256_cmp_pd with explicit predicates. ==, < and <= use ordered
+  // predicates, while != uses an unordered one and >= / > are expressed as
+  // "not less than" / "not less or equal" (_CMP_NLT_US / _CMP_NLE_US).
+  // NOTE(review): per the Intel predicate table the unordered forms evaluate to true
+  // when an operand is NaN, so >= and > differ from scalar C++ semantics for NaN.
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);  }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS);  }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS);  }
+#else
+  // aarch64: uses per-predicate helpers (_mm256_cmpeq_pd, ...), presumably supplied by
+  // the project's AVX emulation layer - their definitions are not visible here.
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b);  }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b);  }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); }
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); }
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, b);  }
+#endif
+
+  /* Vector/scalar comparison overloads: the double is broadcast to all four lanes and the
+     vector/vector operator of the same name is applied. */
+  __forceinline vboold4 operator ==(const vdouble4& a, double          b) { return a == vdouble4(b); }
+  __forceinline vboold4 operator ==(double          a, const vdouble4& b) { return vdouble4(a) == b; }
+
+  __forceinline vboold4 operator !=(const vdouble4& a, double          b) { return a != vdouble4(b); }
+  __forceinline vboold4 operator !=(double          a, const vdouble4& b) { return vdouble4(a) != b; }
+
+  __forceinline vboold4 operator < (const vdouble4& a, double          b) { return a <  vdouble4(b); }
+  __forceinline vboold4 operator < (double          a, const vdouble4& b) { return vdouble4(a) <  b; }
+
+  __forceinline vboold4 operator >=(const vdouble4& a, double          b) { return a >= vdouble4(b); }
+  __forceinline vboold4 operator >=(double          a, const vdouble4& b) { return vdouble4(a) >= b; }
+
+  __forceinline vboold4 operator > (const vdouble4& a, double          b) { return a >  vdouble4(b); }
+  __forceinline vboold4 operator > (double          a, const vdouble4& b) { return vdouble4(a) >  b; }
+
+  __forceinline vboold4 operator <=(const vdouble4& a, double          b) { return a <= vdouble4(b); }
+  __forceinline vboold4 operator <=(double          a, const vdouble4& b) { return vdouble4(a) <= b; }
+
+  /* Named spellings of the comparison operators; each simply forwards to the operator. */
+  __forceinline vboold4 eq(const vdouble4& a, const vdouble4& b) { return a == b; }
+  __forceinline vboold4 ne(const vdouble4& a, const vdouble4& b) { return a != b; }
+  __forceinline vboold4 lt(const vdouble4& a, const vdouble4& b) { return a <  b; }
+  __forceinline vboold4 ge(const vdouble4& a, const vdouble4& b) { return a >= b; }
+  __forceinline vboold4 gt(const vdouble4& a, const vdouble4& b) { return a >  b; }
+  __forceinline vboold4 le(const vdouble4& a, const vdouble4& b) { return a <= b; }
+
+  /* Masked comparisons: the result is restricted to the lanes enabled in `mask`. */
+#if defined(__AVX512VL__)
+  // AVX-512VL: the mask is handed to the masked compare intrinsic itself
+  __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  // otherwise: full comparison followed by an AND with the mask
+  __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a == b); }
+  __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a != b); }
+  __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <  b); }
+  __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >= b); }
+  __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >  b); }
+  __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <= b); }
+#endif
+ 
+  /* Per-lane select: lanes enabled in m are taken from t, the remaining lanes from f.
+     Note the operand order of the blend intrinsics: the "false" source comes first. */
+  __forceinline vdouble4 select(const vboold4& m, const vdouble4& t, const vdouble4& f) {
+#if defined(__AVX512VL__)
+    return _mm256_mask_blend_pd(m, f, t);
+#else
+    // blendv takes the mask as a vector operand (Intel documents that it tests the top bit of each lane)
+    return _mm256_blendv_pd(f, t, m);
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Permutes inside each pair of lanes (each 128-bit half): the first lane of a pair
+     receives element i0 of that pair and the second lane element i1 (i0, i1 in {0,1}). */
+  template<int i0, int i1>
+  __forceinline vdouble4 shuffle(const vdouble4& v)
+  {
+    // one selector bit per destination lane, lane 0 in bit 0: i0, i1, i0, i1
+    const int control = i0 | (i1 << 1) | (i0 << 2) | (i1 << 3);
+    return _mm256_permute_pd(v, control);
+  }
+
+  /* Single-index form: replicates element i of each pair into both lanes of that pair. */
+  template<int i>
+  __forceinline vdouble4 shuffle(const vdouble4& v) {
+    return shuffle<i, i>(v);
+  }
+
+  /* Permutes whole 128-bit halves: the low half of the result is half i0 of v and the
+     high half is half i1 of v (both operands of the permute are v itself). */
+  template<int i0, int i1>
+  __forceinline vdouble4 shuffle2(const vdouble4& v)
+  {
+    // selector for the low half sits in bits 0..3, selector for the high half in bits 4..7
+    const int control = i0 | (i1 << 4);
+    return _mm256_permute2f128_pd(v, v, control);
+  }
+
+  /* Extracts lane 0: takes the lower 128 bits of the register and returns its first double. */
+  __forceinline double toScalar(const vdouble4& v) {
+    return _mm_cvtsd_f64(_mm256_castpd256_pd128(v));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Horizontal reductions. The *2 helpers combine every lane with its neighbour inside
+     the same 128-bit half (shuffle<1,0> swaps the two lanes of a pair). The full
+     versions then combine the two halves (shuffle2<1,0> swaps them), so every lane of
+     the returned vector has been combined with all four inputs. */
+  __forceinline vdouble4 vreduce_min2(const vdouble4& x) { return min(x, shuffle<1,0>(x)); }
+  __forceinline vdouble4 vreduce_min (const vdouble4& y)
+  {
+    const vdouble4 pairs = vreduce_min2(y);
+    return min(pairs, shuffle2<1,0>(pairs));
+  }
+
+  __forceinline vdouble4 vreduce_max2(const vdouble4& x) { return max(x, shuffle<1,0>(x)); }
+  __forceinline vdouble4 vreduce_max (const vdouble4& y)
+  {
+    const vdouble4 pairs = vreduce_max2(y);
+    return max(pairs, shuffle2<1,0>(pairs));
+  }
+
+  __forceinline vdouble4 vreduce_and2(const vdouble4& x) { return x & shuffle<1,0>(x); }
+  __forceinline vdouble4 vreduce_and (const vdouble4& y)
+  {
+    const vdouble4 pairs = vreduce_and2(y);
+    return pairs & shuffle2<1,0>(pairs);
+  }
+
+  __forceinline vdouble4 vreduce_or2(const vdouble4& x) { return x | shuffle<1,0>(x); }
+  __forceinline vdouble4 vreduce_or (const vdouble4& y)
+  {
+    const vdouble4 pairs = vreduce_or2(y);
+    return pairs | shuffle2<1,0>(pairs);
+  }
+
+  __forceinline vdouble4 vreduce_add2(const vdouble4& x) { return x + shuffle<1,0>(x); }
+  __forceinline vdouble4 vreduce_add (const vdouble4& y)
+  {
+    const vdouble4 pairs = vreduce_add2(y);
+    return pairs + shuffle2<1,0>(pairs);
+  }
+
+  /* Scalar results: run the vector reduction and read lane 0. */
+  __forceinline double reduce_add(const vdouble4& a) { const vdouble4 r = vreduce_add(a); return toScalar(r); }
+  __forceinline double reduce_min(const vdouble4& a) { const vdouble4 r = vreduce_min(a); return toScalar(r); }
+  __forceinline double reduce_max(const vdouble4& a) { const vdouble4 r = vreduce_max(a); return toScalar(r); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Memory load and store operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  /* Writes the four lanes as "<v0, v1, v2, v3>". */
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble4& v)
+  {
+    cout << "<";
+    for (size_t lane = 0; lane < 4; ++lane) {
+      if (lane != 0) cout << ", ";   // separator goes between lanes only
+      cout << v[lane];
+    }
+    cout << ">";
+    return cout;
+  }
+}
+
+/* Remove the short type-name macros defined at the top of this header so they do not
+   stay defined past it. */
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vdouble8_avx512.h
+++ b/thirdparty/embree/common/simd/vdouble8_avx512.h
@@ -0,0 +1,351 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+/* Temporarily map the short type names onto their *_impl spellings, so that the
+   specialization written below as vdouble<8> is really a specialization of vdouble_impl.
+   NOTE(review): the end of this header is outside the visible range; presumably a
+   matching #undef block removes these macros again - verify. */
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX-512 64-bit double type */
+  template<>
+  struct vdouble<8>
+  {
+    ALIGNED_STRUCT_(64);
+
+    typedef vboold8 Bool;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {             // data
+      __m512d v;        // all eight lanes as one AVX-512 register
+      double i[8];      // the same storage viewed lane by lane
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /* the default constructor leaves the lanes uninitialized */
+    __forceinline vdouble() {}
+    __forceinline vdouble(const vdouble8& t) : v(t.v) {}
+    __forceinline vdouble8& operator =(const vdouble8& f) { v = f.v; return *this; }
+
+    __forceinline vdouble(const __m512d& t) : v(t) {}
+    __forceinline operator __m512d() const { return v; }
+    /* yields the lower 256 bits of the register */
+    __forceinline operator __m256d() const { return _mm512_castpd512_pd256(v); }
+
+    /* broadcasts one scalar into all eight lanes */
+    __forceinline vdouble(double i) : v(_mm512_set1_pd(i)) {}
+
+    /* repeats the sequence a,b,c,d twice: lanes 0..3 and lanes 4..7 */
+    __forceinline vdouble(double a, double b, double c, double d) : v(_mm512_set4_pd(d,c,b,a)) {}
+
+    /* a0 goes to lane 0 and a7 to lane 7; _mm512_set_pd lists the highest lane first */
+    __forceinline vdouble(double a0, double a1, double a2, double a3,
+                          double a4, double a5, double a6, double a7)
+      : v(_mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vdouble(ZeroTy) : v(_mm512_setzero_pd()) {}
+    __forceinline vdouble(OneTy)  : v(_mm512_set1_pd(1)) {}
+    /* lanes 0..7 hold 0..7 */
+    __forceinline vdouble(StepTy) : v(_mm512_setr_pd(0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0)) {}
+    /* lanes 0..7 hold 7..0 */
+    __forceinline vdouble(ReverseStepTy) : v(_mm512_set_pd(0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /* store through the streaming (non-temporal) store intrinsic */
+    static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) {
+      _mm512_stream_pd(static_cast<double*>(ptr), a);
+    }
+
+    /* unaligned load of eight consecutive doubles */
+    static __forceinline vdouble8 loadu(const void* addr) {
+      return _mm512_loadu_pd(static_cast<const double*>(addr));
+    }
+
+    /* aligned loads; the aligned intrinsic is used, so addr is expected to be 64-byte aligned */
+    static __forceinline vdouble8 load(const vdouble8* addr) {
+      return _mm512_load_pd(reinterpret_cast<const double*>(addr));
+    }
+
+    static __forceinline vdouble8 load(const double* addr) {
+      return _mm512_load_pd(addr);
+    }
+
+    /* aligned / unaligned full stores */
+    static __forceinline void store(void* ptr, const vdouble8& v) {
+      _mm512_store_pd(ptr, v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vdouble8& v) {
+      _mm512_storeu_pd(ptr, v);
+    }
+
+    /* masked stores: only lanes enabled in mask are written to memory */
+    static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) {
+      _mm512_mask_storeu_pd(ptr, mask, f);
+    }
+
+    static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) {
+      _mm512_mask_store_pd(addr, mask, v2);
+    }
+
+    /* packs the lanes of v that are enabled in mask towards lane 0; the remaining
+       lanes keep the values of v */
+    static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) {
+      return _mm512_mask_compress_pd(v, mask, v);
+    }
+
+    /* packs the enabled lanes of b towards lane 0; the remaining lanes come from a */
+    static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) {
+      return _mm512_mask_compress_pd(a, mask, b);
+    }
+
+    /* reads one double through the untyped pointer and replicates it into every lane */
+    static __forceinline vdouble8 broadcast(const void* a) {
+      return _mm512_set1_pd(*static_cast<const double*>(a));
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /* per-lane access through the array view of the union; index must be below 8 */
+    __forceinline       double& operator [](size_t index)       { assert(index < 8); return i[index]; }
+    __forceinline const double& operator [](size_t index) const { assert(index < 8); return i[index]; }
+
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Reinterpret the 512 bits as the other vector type; the cast intrinsics do not
+     convert values. */
+  __forceinline vdouble8 asDouble(const vllong8&  a) { return _mm512_castsi512_pd(a); }
+  __forceinline vllong8  asLLong (const vdouble8& a) { return _mm512_castpd_si512(a); }
+
+  /* Unary plus returns its operand unchanged. Unary minus is computed as 0 - a.
+     NOTE(review): a subtraction from zero is not a pure sign flip - under the default
+     rounding mode 0.0 - (+0.0) yields +0.0 rather than -0.0; confirm no caller depends
+     on the sign of a negated zero. */
+  __forceinline vdouble8 operator +(const vdouble8& a) { return a; }
+  __forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Lane-wise arithmetic (+, -, *). The overloads that take a double broadcast the scalar
+     to all eight lanes through the vdouble8(double) constructor first. */
+  __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return _mm512_add_pd(a, b); }
+  __forceinline vdouble8 operator +(const vdouble8& a, double          b) { return a + vdouble8(b); }
+  __forceinline vdouble8 operator +(double          a, const vdouble8& b) { return vdouble8(a) + b; }
+
+  __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return _mm512_sub_pd(a, b); }
+  __forceinline vdouble8 operator -(const vdouble8& a, double          b) { return a - vdouble8(b); }
+  __forceinline vdouble8 operator -(double          a, const vdouble8& b) { return vdouble8(a) - b; }
+
+  __forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return _mm512_mul_pd(a, b); }
+  __forceinline vdouble8 operator *(const vdouble8& a, double          b) { return a * vdouble8(b); }
+  __forceinline vdouble8 operator *(double          a, const vdouble8& b) { return vdouble8(a) * b; }
+
+  /* Bitwise AND / OR / XOR on the raw bit patterns of the lanes; scalar operands are
+     broadcast and combined via their bit pattern. */
+  __forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return _mm512_and_pd(a, b); }
+  __forceinline vdouble8 operator &(const vdouble8& a, double          b) { return a & vdouble8(b); }
+  __forceinline vdouble8 operator &(double          a, const vdouble8& b) { return vdouble8(a) & b; }
+
+  __forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return _mm512_or_pd(a, b); }
+  __forceinline vdouble8 operator |(const vdouble8& a, double          b) { return a | vdouble8(b); }
+  __forceinline vdouble8 operator |(double          a, const vdouble8& b) { return vdouble8(a) | b; }
+
+  __forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { return _mm512_xor_pd(a, b); }
+  __forceinline vdouble8 operator ^(const vdouble8& a, double          b) { return a ^ vdouble8(b); }
+  __forceinline vdouble8 operator ^(double          a, const vdouble8& b) { return vdouble8(a) ^ b; }
+
+  /* Bit shifts. All of these reinterpret the doubles as 64-bit integers, shift the raw bit
+     patterns and reinterpret the result as doubles again; no numeric scaling is implied.
+     operator>> uses the arithmetic (srai/srav) right shift. */
+
+  // same shift count for every lane
+  __forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), n)); }
+  __forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), n)); }
+
+  // per-lane shift counts taken from n
+  __forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a), n)); }
+  __forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a), n)); }
+
+  // named forms: shift left logical, shift right arithmetic, shift right logical
+  __forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return  _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), b)); }
+  __forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return  _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), b)); }
+  __forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return  _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), b)); }
+
+  /* Lane-wise minimum and maximum; scalar arguments are broadcast first.
+     NOTE(review): per the Intel documentation _mm512_min_pd/_mm512_max_pd return the second
+     operand when a lane of either input is NaN, so the result is not symmetric in (a, b)
+     for NaN inputs - confirm callers do not rely on a particular NaN policy. */
+  __forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a, b); }
+  __forceinline vdouble8 min(const vdouble8& a, double          b) { return min(a,vdouble8(b)); }
+  __forceinline vdouble8 min(double          a, const vdouble8& b) { return min(vdouble8(a),b); }
+
+  __forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a, b); }
+  __forceinline vdouble8 max(const vdouble8& a, double          b) { return max(a,vdouble8(b)); }
+  __forceinline vdouble8 max(double          a, const vdouble8& b) { return max(vdouble8(a),b); }
+
+  /* Masked binary operations: lanes enabled in the mask receive "a op b", all other lanes
+     are copied from c. The result is returned; c is passed as the pass-through source to
+     the intrinsic (it is taken by non-const reference but not assigned in these bodies). */
+  __forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_add_pd(c,mask,a,b); }
+  __forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c,mask,a,b); }
+
+  __forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c,m,a,b); }
+  __forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c,m,a,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Fused multiply-add family, lane-wise:
+       madd  =  a*b + c      msub  =  a*b - c
+       nmadd = -(a*b) + c    nmsub = -(a*b) - c */
+  __forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a,b,c); }
+  __forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a,b,c); }
+  __forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a,b,c); }
+  __forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a,b,c); }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Compound assignment: each operator evaluates the corresponding binary operator,
+     stores the result back into a and returns a reference to a. */
+  __forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) { return a = a + b; }
+  __forceinline vdouble8& operator +=(vdouble8& a, double          b) { return a = a + b; }
+
+  __forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) { return a = a - b; }
+  __forceinline vdouble8& operator -=(vdouble8& a, double          b) { return a = a - b; }
+
+  __forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) { return a = a * b; }
+  __forceinline vdouble8& operator *=(vdouble8& a, double          b) { return a = a * b; }
+
+  __forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) { return a = a & b; }
+  __forceinline vdouble8& operator &=(vdouble8& a, double          b) { return a = a & b; }
+
+  __forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) { return a = a | b; }
+  __forceinline vdouble8& operator |=(vdouble8& a, double          b) { return a = a | b; }
+
+  /* In-place bit shifts of the raw 64-bit lane patterns, same count for every lane.
+     The count is an unsigned int so that it reaches the operator<< / operator>>
+     (vdouble8, unsigned int) overloads directly, without an implicit floating-point to
+     integer conversion (which is undefined for negative or out-of-range values). */
+  __forceinline vdouble8& operator <<=(vdouble8& a, const unsigned int b) { return a = a << b; }
+  __forceinline vdouble8& operator >>=(vdouble8& a, const unsigned int b) { return a = a >> b; }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Lane-wise comparisons producing a vboold8 mask. The vector/vector forms compare
+     straight into a mask register; the scalar forms broadcast the double first.
+     NOTE(review): the predicate is passed using the _MM_CMPINT_* (integer-compare) names;
+     confirm their numeric values select the intended floating-point predicates,
+     in particular how NaN operands are treated by GE / GT. */
+  __forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 operator ==(const vdouble8& a, double          b) { return a == vdouble8(b); }
+  __forceinline vboold8 operator ==(double          a, const vdouble8& b) { return vdouble8(a) == b; }
+
+  __forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 operator !=(const vdouble8& a, double          b) { return a != vdouble8(b); }
+  __forceinline vboold8 operator !=(double          a, const vdouble8& b) { return vdouble8(a) != b; }
+
+  __forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 operator < (const vdouble8& a, double          b) { return a <  vdouble8(b); }
+  __forceinline vboold8 operator < (double          a, const vdouble8& b) { return vdouble8(a) <  b; }
+
+  __forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 operator >=(const vdouble8& a, double          b) { return a >= vdouble8(b); }
+  __forceinline vboold8 operator >=(double          a, const vdouble8& b) { return vdouble8(a) >= b; }
+
+  __forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 operator > (const vdouble8& a, double          b) { return a >  vdouble8(b); }
+  __forceinline vboold8 operator > (double          a, const vdouble8& b) { return vdouble8(a) >  b; }
+
+  __forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboold8 operator <=(const vdouble8& a, double          b) { return a <= vdouble8(b); }
+  __forceinline vboold8 operator <=(double          a, const vdouble8& b) { return vdouble8(a) <= b; }
+
+  /* Named spellings of the comparison operators; each forwards to the vector/vector
+     operator of the same meaning defined above. */
+  __forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return a == b; }
+  __forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return a != b; }
+  __forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return a <  b; }
+  __forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return a >= b; }
+  __forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return a >  b; }
+  __forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return a <= b; }
+
+  /*! Masked comparisons: `mask` is forwarded to _mm512_mask_cmp_pd_mask as its write mask. */
+  __forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) {
+    return _mm512_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ);
+  }
+  __forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) {
+    return _mm512_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE);
+  }
+  __forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) {
+    return _mm512_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT);
+  }
+  __forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) {
+    return _mm512_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE);
+  }
+  __forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) {
+    return _mm512_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT);
+  }
+  __forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) {
+    return _mm512_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE);
+  }
+
+  /*! Lane-wise select: yields t in lanes whose bit is set in m and f in all other lanes. */
+  __forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) {
+    // A masked blend produces the same lanes as the former _mm512_mask_or_pd(f,m,t,t)
+    // but is an AVX-512F intrinsic, so it does not additionally require AVX-512DQ.
+    return _mm512_mask_blend_pd(m, f, t);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Applies _mm512_permute_pd with an 8-bit immediate built by repeating the
+   *  selector pair (i0, i1) four times. */
+  template<int i0, int i1>
+  __forceinline vdouble8 shuffle(const vdouble8& v) {
+    enum {
+      pair = (i1 << 1) | i0,                                 // selector bits for one pair of elements
+      imm  = (pair << 6) | (pair << 4) | (pair << 2) | pair  // the same pair repeated for all 8 bits
+    };
+    return _mm512_permute_pd(v, imm);
+  }
+
+  /*! Single-index form: uses the same selector for both positions of the pair. */
+  template<int i>
+  __forceinline vdouble8 shuffle(const vdouble8& v) {
+    return shuffle<i, i>(v);
+  }
+
+  /*! Four-index form: applies _mm512_permutex_pd with _MM_SHUFFLE(i3, i2, i1, i0). */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vdouble8 shuffle(const vdouble8& v) {
+    enum { imm = _MM_SHUFFLE(i3, i2, i1, i0) };
+    return _mm512_permutex_pd(v, imm);
+  }
+
+  /*! Shuffles groups of four doubles: i0 picks the source group for the low half of
+   *  the result and i1 the one for the high half, expressed as pairs of f64x2 chunks. */
+  template<int i0, int i1>
+  __forceinline vdouble8 shuffle4(const vdouble8& v) {
+    enum {
+      first0 = i0 * 2,  // first f64x2 chunk of the group selected by i0
+      first1 = i1 * 2   // first f64x2 chunk of the group selected by i1
+    };
+    return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(first1 + 1, first1, first0 + 1, first0));
+  }
+
+  /*! Single-index form: the same group is used for both halves of the result. */
+  template<int i>
+  __forceinline vdouble8 shuffle4(const vdouble8& v) {
+    return shuffle4<i, i>(v);
+  }
+  
+  /*! Runs _mm512_alignr_epi64 on the bit patterns of a and b with element count i
+   *  and returns the result reinterpreted as doubles. */
+  template<int i>
+  __forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) {
+    const __m512i a_bits  = _mm512_castpd_si512(a);
+    const __m512i b_bits  = _mm512_castpd_si512(b);
+    const __m512i aligned = _mm512_alignr_epi64(a_bits, b_bits, i);
+    return _mm512_castsi512_pd(aligned);
+  }
+
+  /*! Returns the lowest double of v as a scalar. */
+  __forceinline double toScalar(const vdouble8& v) {
+    const __m128d low_pair = _mm512_castpd512_pd128(v);
+    return _mm_cvtsd_f64(low_pair);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Vector reductions: every stage combines x with a shuffled copy of itself; the
+   *  2- and 4-wide stages feed the full-width reduction. */
+  __forceinline vdouble8 vreduce_add2(vdouble8 x) {
+    return x + shuffle<1,0,3,2>(x);
+  }
+  __forceinline vdouble8 vreduce_add4(vdouble8 x) {
+    const vdouble8 r2 = vreduce_add2(x);
+    return r2 + shuffle<2,3,0,1>(r2);
+  }
+  __forceinline vdouble8 vreduce_add (vdouble8 x) {
+    const vdouble8 r4 = vreduce_add4(x);
+    return r4 + shuffle4<1,0>(r4);
+  }
+
+  __forceinline vdouble8 vreduce_min2(vdouble8 x) {
+    return min(x, shuffle<1,0,3,2>(x));
+  }
+  __forceinline vdouble8 vreduce_min4(vdouble8 x) {
+    const vdouble8 r2 = vreduce_min2(x);
+    return min(r2, shuffle<2,3,0,1>(r2));
+  }
+  __forceinline vdouble8 vreduce_min (vdouble8 x) {
+    const vdouble8 r4 = vreduce_min4(x);
+    return min(r4, shuffle4<1,0>(r4));
+  }
+
+  __forceinline vdouble8 vreduce_max2(vdouble8 x) {
+    return max(x, shuffle<1,0,3,2>(x));
+  }
+  __forceinline vdouble8 vreduce_max4(vdouble8 x) {
+    const vdouble8 r2 = vreduce_max2(x);
+    return max(r2, shuffle<2,3,0,1>(r2));
+  }
+  __forceinline vdouble8 vreduce_max (vdouble8 x) {
+    const vdouble8 r4 = vreduce_max4(x);
+    return max(r4, shuffle4<1,0>(r4));
+  }
+
+  /*! Scalar reductions: run the vector reduction and read back its lowest lane. */
+  __forceinline double reduce_add(const vdouble8& v) {
+    const vdouble8 total = vreduce_add(v);
+    return toScalar(total);
+  }
+  __forceinline double reduce_min(const vdouble8& v) {
+    const vdouble8 lowest = vreduce_min(v);
+    return toScalar(lowest);
+  }
+  __forceinline double reduce_max(const vdouble8& v) {
+    const vdouble8 highest = vreduce_max(v);
+    return toScalar(highest);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Permutation Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Rearranges the doubles of v according to the per-lane indices in `index`
+   *  (thin wrapper over _mm512_permutexvar_pd). */
+  __forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) {
+    return _mm512_permutexvar_pd(index, v);
+  }
+
+  /*! Permutes a with the index vector built from the reverse_step constant. */
+  __forceinline vdouble8 reverse(const vdouble8& a) {
+    const vllong8 order(reverse_step);
+    return permute(a, order);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Prints the eight lanes as "<v0, v1, ..., v7>". */
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v)
+  {
+    // the first lane is introduced by the opening bracket, every later one by a separator
+    for (size_t lane=0; lane<8; lane++)
+      cout << (lane == 0 ? "<" : ", ") << v[lane];
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vfloat16_avx512.h
+++ b/thirdparty/embree/common/simd/vfloat16_avx512.h
@@ -0,0 +1,627 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 16-wide AVX-512 float type */
+  template<>
+  struct vfloat<16>
+  {
+    ALIGNED_STRUCT_(64);
+
+    typedef vboolf16 Bool;
+    typedef vint16   Int;
+    typedef vfloat16 Float;
+
+    enum  { size = 16 }; // number of SIMD elements
+    union {              // data: one storage viewed as a register, as floats, or as ints
+      __m512 v;
+      float f[16];
+      int i[16];
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! Default construction leaves the lanes uninitialized. */
+    __forceinline vfloat() {}
+    __forceinline vfloat(const vfloat16& t) : v(t.v) {}
+    __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; }
+
+    /*! Conversions from and to the raw register; the narrower casts keep the low part. */
+    __forceinline vfloat(const __m512& t) : v(t) {}
+    __forceinline operator __m512() const { return v; }
+    __forceinline operator __m256() const { return _mm512_castps512_ps256(v); }
+    __forceinline operator __m128() const { return _mm512_castps512_ps128(v); }
+
+    /*! Sets every lane to f. */
+    __forceinline vfloat(float f)
+      : v(_mm512_set1_ps(f)) {}
+
+    /*! Fills the register from four scalars via _mm512_set4_ps. */
+    __forceinline vfloat(float a, float b, float c, float d)
+      : v(_mm512_set4_ps(a, b, c, d)) {}
+
+    /*! Repeats one 4-wide vector in all four 128-bit parts. */
+    __forceinline vfloat(const vfloat4& i)
+      : v(_mm512_broadcast_f32x4(i)) {}
+
+    /*! Concatenates four 4-wide vectors; a is placed in the lowest part. */
+    __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) {
+      __m512 packed = _mm512_castps128_ps512(a);
+      packed = _mm512_insertf32x4(packed, b, 1);
+      packed = _mm512_insertf32x4(packed, c, 2);
+      packed = _mm512_insertf32x4(packed, d, 3);
+      v = packed;
+    }
+
+    /*! Broadcasts a, then overwrites the lanes selected by mask with the broadcast of b. */
+    __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b)
+      : v(_mm512_mask_broadcast_f32x4(_mm512_broadcast_f32x4(a), mask, b)) {}
+
+    /*! Repeats one 8-wide vector in both 256-bit halves. */
+    __forceinline vfloat(const vfloat8& i)
+      : v(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i)))) {}
+
+    /*! Concatenates two 8-wide vectors; a is placed in the low half. */
+    __forceinline vfloat(const vfloat8& a, const vfloat8& b) {
+      const __m512 lower = _mm512_castps256_ps512(a);
+#if defined(__AVX512DQ__)
+      v = _mm512_insertf32x8(lower, b, 1);
+#else
+      // without AVX-512DQ the upper half is inserted through the f64x4 form
+      v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(lower), _mm256_castps_pd(b), 1));
+#endif
+    }
+
+    /* WARNING: due to f64x4 the mask is considered as an 8bit mask */
+    /*__forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) {
+      __m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a));
+      aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b));
+      v = _mm512_castpd_ps(aa);
+      }*/
+
+    /*! Numeric conversion from signed / unsigned 32-bit integer lanes. */
+    __forceinline explicit vfloat(const vint16& a)
+      : v(_mm512_cvtepi32_ps(a)) {}
+
+    __forceinline explicit vfloat(const vuint16& a)
+      : v(_mm512_cvtepu32_ps(a)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat(ZeroTy)      : v(_mm512_setzero_ps()) {}
+    __forceinline vfloat(OneTy)       : v(_mm512_set1_ps(1.0f)) {}
+    __forceinline vfloat(PosInfTy)    : v(_mm512_set1_ps(pos_inf)) {}
+    __forceinline vfloat(NegInfTy)    : v(_mm512_set1_ps(neg_inf)) {}
+    __forceinline vfloat(StepTy)      : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
+    __forceinline vfloat(NaNTy)       : v(_mm512_set1_ps(nan)) {}
+    __forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! Full-width loads; load() uses the aligned intrinsic, loadu() the unaligned one. */
+    static __forceinline vfloat16 load (const void* ptr) {
+      return _mm512_load_ps(static_cast<const float*>(ptr));
+    }
+    static __forceinline vfloat16 loadu(const void* ptr) {
+      return _mm512_loadu_ps(static_cast<const float*>(ptr));
+    }
+
+    /*! Masked loads; lanes not selected by mask come from a zero vector. */
+    static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) {
+      return _mm512_mask_load_ps(_mm512_setzero_ps(), mask, static_cast<const float*>(ptr));
+    }
+    static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) {
+      return _mm512_mask_loadu_ps(_mm512_setzero_ps(), mask, static_cast<const float*>(ptr));
+    }
+
+    /*! Full-width stores (aligned / unaligned intrinsic). */
+    static __forceinline void store (void* ptr, const vfloat16& v) {
+      _mm512_store_ps(static_cast<float*>(ptr), v);
+    }
+    static __forceinline void storeu(void* ptr, const vfloat16& v) {
+      _mm512_storeu_ps(static_cast<float*>(ptr), v);
+    }
+
+    /*! Masked stores; mask is passed as the write mask of the store intrinsic. */
+    static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) {
+      _mm512_mask_store_ps(static_cast<float*>(ptr), mask, v);
+    }
+    static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) {
+      _mm512_mask_storeu_ps(static_cast<float*>(ptr), mask, v);
+    }
+
+    /*! Store through the streaming intrinsic _mm512_stream_ps. */
+    static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) {
+      _mm512_stream_ps(static_cast<float*>(ptr), a);
+    }
+
+    /*! Loads one float and replicates it into all lanes. */
+    static __forceinline vfloat16 broadcast(const float* f) {
+      return _mm512_set1_ps(*f);
+    }
+
+    /*! Gathers from ptr using the 32-bit indices in index, forwarding scale to the intrinsic. */
+    template<int scale = 4>
+    static __forceinline vfloat16 gather(const float* ptr, const vint16& index) {
+      return _mm512_i32gather_ps(index, ptr, scale);
+    }
+
+    /*! Masked gather; lanes not selected by mask come from a zero vector. */
+    template<int scale = 4>
+    static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) {
+      return _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask, index, ptr, scale);
+    }
+
+    /*! Scatters the lanes of v to ptr using the 32-bit indices in index. */
+    template<int scale = 4>
+    static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) {
+      _mm512_i32scatter_ps(ptr, index, v, scale);
+    }
+
+    /*! Masked scatter; mask is passed as the write mask of the scatter intrinsic. */
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) {
+      _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! Per-lane access through the float view of the union; index must be below 16. */
+    __forceinline       float& operator [](size_t index)       { assert(index < 16); return f[index]; }
+    __forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Bit-pattern reinterpretation between float and integer vectors (no value conversion). */
+  __forceinline vfloat16 asFloat(const vint16& a) {
+    return _mm512_castsi512_ps(a);
+  }
+  __forceinline vint16 asInt(const vfloat16& a) {
+    return _mm512_castps_si512(a);
+  }
+  __forceinline vuint16 asUInt(const vfloat16& a) {
+    return _mm512_castps_si512(a);
+  }
+
+  /*! Conversions that go through the explicit converting constructors of the target type. */
+  __forceinline vint16 toInt(const vfloat16& a) {
+    return vint16(a);
+  }
+  __forceinline vfloat16 toFloat(const vint16& a) {
+    return vfloat16(a);
+  }
+
+  /*! Unary plus is the identity; unary minus multiplies every lane by -1. */
+  __forceinline vfloat16 operator +(const vfloat16& a) {
+    return a;
+  }
+  __forceinline vfloat16 operator -(const vfloat16& a) {
+    const vfloat16 minus_one(-1.0f);
+    return _mm512_mul_ps(a, minus_one);
+  }
+
+  /*! abs keeps all bits except the top bit of each 32-bit lane; signmsk keeps only that top bit. */
+  __forceinline vfloat16 abs(const vfloat16& a) {
+    const __m512i keep = _mm512_set1_epi32(0x7FFFFFFF);
+    const __m512i bits = _mm512_castps_si512(a);
+    return _mm512_castsi512_ps(_mm512_and_epi32(bits, keep));
+  }
+  __forceinline vfloat16 signmsk(const vfloat16& a) {
+    const __m512i keep = _mm512_set1_epi32(0x80000000);
+    const __m512i bits = _mm512_castps_si512(a);
+    return _mm512_castsi512_ps(_mm512_and_epi32(bits, keep));
+  }
+
+  /*! Reciprocal: starts from the _mm512_rcp14_ps estimate r and applies one
+   *  correction step, r + r * (1 - a*r). */
+  __forceinline vfloat16 rcp(const vfloat16& a)
+  {
+    const vfloat16 estimate = _mm512_rcp14_ps(a);
+    const vfloat16 residual = _mm512_fnmadd_ps(a, estimate, vfloat16(1.0f));  // 1 - a*r
+    return _mm512_fmadd_ps(estimate, residual, estimate);                     // r + r*(1 - a*r)
+  }
+
+  /*! Lane-wise square (a*a) and square root. */
+  __forceinline vfloat16 sqr (const vfloat16& a) {
+    return _mm512_mul_ps(a, a);
+  }
+  __forceinline vfloat16 sqrt(const vfloat16& a) {
+    return _mm512_sqrt_ps(a);
+  }
+
+  /*! Reciprocal square root: refines the _mm512_rsqrt14_ps estimate r with
+   *  1.5*r + (-0.5*a*r) * (r*r). */
+  __forceinline vfloat16 rsqrt(const vfloat16& a)
+  {
+    const vfloat16 r = _mm512_rsqrt14_ps(a);
+    // the products are formed in exactly this order to keep the rounding unchanged
+    const __m512 neg_half_a = _mm512_mul_ps(a, _mm512_set1_ps(-0.5f));
+    const __m512 scaled     = _mm512_mul_ps(neg_half_a, r);
+    const __m512 r_squared  = _mm512_mul_ps(r, r);
+    const __m512 correction = _mm512_mul_ps(scaled, r_squared);
+    return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r, correction);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Lane-wise addition; scalar operands are broadcast to all lanes first. */
+  __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) {
+    return _mm512_add_ps(a, b);
+  }
+  __forceinline vfloat16 operator +(const vfloat16& a, float b) {
+    const vfloat16 rhs(b);
+    return a + rhs;
+  }
+  __forceinline vfloat16 operator +(float a, const vfloat16& b) {
+    const vfloat16 lhs(a);
+    return lhs + b;
+  }
+
+  /*! Lane-wise subtraction. */
+  __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) {
+    return _mm512_sub_ps(a, b);
+  }
+  __forceinline vfloat16 operator -(const vfloat16& a, float b) {
+    const vfloat16 rhs(b);
+    return a - rhs;
+  }
+  __forceinline vfloat16 operator -(float a, const vfloat16& b) {
+    const vfloat16 lhs(a);
+    return lhs - b;
+  }
+
+  /*! Lane-wise multiplication. */
+  __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) {
+    return _mm512_mul_ps(a, b);
+  }
+  __forceinline vfloat16 operator *(const vfloat16& a, float b) {
+    const vfloat16 rhs(b);
+    return a * rhs;
+  }
+  __forceinline vfloat16 operator *(float a, const vfloat16& b) {
+    const vfloat16 lhs(a);
+    return lhs * b;
+  }
+
+  /*! Lane-wise division. */
+  __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) {
+    return _mm512_div_ps(a, b);
+  }
+  __forceinline vfloat16 operator /(const vfloat16& a, float b) {
+    const vfloat16 rhs(b);
+    return a / rhs;
+  }
+  __forceinline vfloat16 operator /(float a, const vfloat16& b) {
+    const vfloat16 lhs(a);
+    return lhs / b;
+  }
+  
+  /*! Bitwise AND / OR / XOR on the raw 32-bit patterns of the lanes.
+   *  All three go through the integer intrinsics (AVX-512F); the _ps forms of AND/OR
+   *  would additionally require AVX-512DQ while producing the same bits. */
+  __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+  }
+  __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+  }
+  __forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+  }
+  
+  /*! Lane-wise minimum via _mm512_min_ps; scalar operands are broadcast first. */
+  __forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) {
+    return _mm512_min_ps(a, b);
+  }
+  __forceinline vfloat16 min(const vfloat16& a, float b) {
+    const vfloat16 rhs(b);
+    return _mm512_min_ps(a, rhs);
+  }
+  __forceinline vfloat16 min(const float& a, const vfloat16& b) {
+    const vfloat16 lhs(a);
+    return _mm512_min_ps(lhs, b);
+  }
+
+  /*! Lane-wise maximum via _mm512_max_ps; scalar operands are broadcast first. */
+  __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) {
+    return _mm512_max_ps(a, b);
+  }
+  __forceinline vfloat16 max(const vfloat16& a, float b) {
+    const vfloat16 rhs(b);
+    return _mm512_max_ps(a, rhs);
+  }
+  __forceinline vfloat16 max(const float& a, const vfloat16& b) {
+    const vfloat16 lhs(a);
+    return _mm512_max_ps(lhs, b);
+  }
+
+  /*! Minimum computed on the lanes reinterpreted as signed 32-bit integers. */
+  __forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b) {
+    const vint16 smaller = _mm512_min_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b));
+    return _mm512_castsi512_ps(smaller);
+  }
+
+  /*! Maximum computed on the lanes reinterpreted as signed 32-bit integers. */
+  __forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) {
+    const vint16 larger = _mm512_max_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b));
+    return _mm512_castsi512_ps(larger);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Fused multiply-add family, each mapping to the AVX-512 intrinsic of the same flavour. */
+  __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) {
+    return _mm512_fmadd_ps(a, b, c);
+  }
+  __forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) {
+    return _mm512_fmsub_ps(a, b, c);
+  }
+  __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) {
+    return _mm512_fnmadd_ps(a, b, c);
+  }
+  __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) {
+    return _mm512_fnmsub_ps(a, b, c);
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Compound assignments: compute the binary result, store it in a, and return a. */
+  __forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b) { a = a + b; return a; }
+  __forceinline vfloat16& operator +=(vfloat16& a, float           b) { a = a + b; return a; }
+
+  __forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b) { a = a - b; return a; }
+  __forceinline vfloat16& operator -=(vfloat16& a, float           b) { a = a - b; return a; }
+
+  __forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b) { a = a * b; return a; }
+  __forceinline vfloat16& operator *=(vfloat16& a, float           b) { a = a * b; return a; }
+
+  __forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b) { a = a / b; return a; }
+  __forceinline vfloat16& operator /=(vfloat16& a, float           b) { a = a / b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Lane-wise comparisons returning a 16-bit mask; scalar operands are broadcast first. */
+  // NOTE(review): the _MM_CMPINT_* integer predicates are handed to a floating-point
+  // compare throughout; confirm the resulting NaN behaviour is the intended one.
+  __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) {
+    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_EQ);
+  }
+  __forceinline vboolf16 operator ==(const vfloat16& a, float b) {
+    const vfloat16 rhs(b);
+    return a == rhs;
+  }
+  __forceinline vboolf16 operator ==(float a, const vfloat16& b) {
+    const vfloat16 lhs(a);
+    return lhs == b;
+  }
+
+  __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) {
+    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_NE);
+  }
+  __forceinline vboolf16 operator !=(const vfloat16& a, float b) {
+    const vfloat16 rhs(b);
+    return a != rhs;
+  }
+  __forceinline vboolf16 operator !=(float a, const vfloat16& b) {
+    const vfloat16 lhs(a);
+    return lhs != b;
+  }
+
+  __forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) {
+    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_LT);
+  }
+  __forceinline vboolf16 operator < (const vfloat16& a, float b) {
+    const vfloat16 rhs(b);
+    return a < rhs;
+  }
+  __forceinline vboolf16 operator < (float a, const vfloat16& b) {
+    const vfloat16 lhs(a);
+    return lhs < b;
+  }
+
+  __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) {
+    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_GE);
+  }
+  __forceinline vboolf16 operator >=(const vfloat16& a, float b) {
+    const vfloat16 rhs(b);
+    return a >= rhs;
+  }
+  __forceinline vboolf16 operator >=(float a, const vfloat16& b) {
+    const vfloat16 lhs(a);
+    return lhs >= b;
+  }
+
+  __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) {
+    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_GT);
+  }
+  __forceinline vboolf16 operator > (const vfloat16& a, float b) {
+    const vfloat16 rhs(b);
+    return a > rhs;
+  }
+  __forceinline vboolf16 operator > (float a, const vfloat16& b) {
+    const vfloat16 lhs(a);
+    return lhs > b;
+  }
+
+  __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) {
+    return _mm512_cmp_ps_mask(a, b, _MM_CMPINT_LE);
+  }
+  __forceinline vboolf16 operator <=(const vfloat16& a, float b) {
+    const vfloat16 rhs(b);
+    return a <= rhs;
+  }
+  __forceinline vboolf16 operator <=(float a, const vfloat16& b) {
+    const vfloat16 lhs(a);
+    return lhs <= b;
+  }
+
+  /*! Named comparison functions; each forwards to the vector/vector operator of the same meaning. */
+  __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return a == b; }
+  __forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) { return a != b; }
+  __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return a <  b; }
+  __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return a >= b; }
+  __forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) { return a >  b; }
+  __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return a <= b; }
+
+  /*! Masked comparisons: `mask` is forwarded to _mm512_mask_cmp_ps_mask as its write mask. */
+  __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) {
+    return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ);
+  }
+  __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) {
+    return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE);
+  }
+  __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) {
+    return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT);
+  }
+  __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) {
+    return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE);
+  }
+  __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) {
+    return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT);
+  }
+  __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) {
+    return _mm512_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE);
+  }
+  
+  /*! Lane-wise select: yields t in lanes whose bit is set in s and f in all other lanes. */
+  __forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f)
+  {
+    // _mm512_mask_blend_ps takes the "mask clear" operand first, hence (f, t)
+    return _mm512_mask_blend_ps(s, f, t);
+  }
+
+  /*! Linear interpolation a + t*(b-a), evaluated as one fused multiply-add. */
+  __forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) {
+    const vfloat16 span = b - a;
+    return madd(t, span, a);
+  }
+
+  /*! True when every lane lies strictly between -FLT_LARGE and +FLT_LARGE. */
+  __forceinline bool isvalid (const vfloat16& v) {
+    const vboolf16 above_lower = v > vfloat16(-FLT_LARGE);
+    const vboolf16 below_upper = v < vfloat16(+FLT_LARGE);
+    return all(above_lower & below_upper);
+  }
+
+  /*! Swaps a and b in the lanes selected by m; the remaining lanes stay untouched. */
+  __forceinline void xchg(vboolf16 m, vfloat16& a, vfloat16& b)
+  {
+    const vfloat16 old_a = a;   // a is overwritten first, so keep its previous lanes
+    a = select(m, b, old_a);
+    b = select(m, old_a, b);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  /*! Lane-wise floor. */
+  __forceinline vfloat16 floor(const vfloat16& a)
+  {
+    return _mm512_floor_ps(a);
+  }
+  /*! Lane-wise ceil. */
+  __forceinline vfloat16 ceil (const vfloat16& a)
+  {
+    return _mm512_ceil_ps(a);
+  }
+  /*! Lane-wise rounding to the nearest integer value, with exceptions suppressed. */
+  __forceinline vfloat16 round (const vfloat16& a)
+  {
+    enum { mode = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC };
+    return _mm512_roundscale_ps(a, mode);
+  }
+  /*! Converts to 32-bit integers rounding towards negative infinity, with exceptions suppressed. */
+  __forceinline vint16 floori (const vfloat16& a)
+  {
+    enum { mode = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC };
+    return _mm512_cvt_roundps_epi32(a, mode);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Thin wrappers over _mm512_unpacklo_ps / _mm512_unpackhi_ps. */
+  __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) {
+    return _mm512_unpacklo_ps(a, b);
+  }
+  __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) {
+    return _mm512_unpackhi_ps(a, b);
+  }
+
+  /*! Applies _mm512_permute_ps with the same selector i in all four positions. */
+  template<int i>
+  __forceinline vfloat16 shuffle(const vfloat16& v) {
+    enum { imm = _MM_SHUFFLE(i, i, i, i) };
+    return _mm512_permute_ps(v, imm);
+  }
+
+  /*! Applies _mm512_permute_ps with the selectors (i0, i1, i2, i3). */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat16 shuffle(const vfloat16& v) {
+    enum { imm = _MM_SHUFFLE(i3, i2, i1, i0) };
+    return _mm512_permute_ps(v, imm);
+  }
+
+  /*! Shuffles whole f32x4 groups: group i is used for every position of the result. */
+  template<int i>
+  __forceinline vfloat16 shuffle4(const vfloat16& v) {
+    enum { imm = _MM_SHUFFLE(i, i, i, i) };
+    return _mm512_shuffle_f32x4(v, v, imm);
+  }
+
+  /*! Shuffles whole f32x4 groups with the selectors (i0, i1, i2, i3). */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat16 shuffle4(const vfloat16& v) {
+    enum { imm = _MM_SHUFFLE(i3, i2, i1, i0) };
+    return _mm512_shuffle_f32x4(v, v, imm);
+  }
+
+  /*! Keeps a where mask 0xcc is clear and takes 64-bit elements of b, permuted with
+   *  control 0x4e, where it is set. */
+  __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b) {
+    const __m512i a_bits = _mm512_castps_si512(a);
+    const __m512i b_bits = _mm512_castps_si512(b);
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(a_bits, mm512_int2mask(0xcc), b_bits, (_MM_PERM_ENUM)0x4e));
+  }
+
+  /*! Counterpart of interleave4_even: keeps b where mask 0x33 is clear and takes
+   *  permuted 64-bit elements of a where it is set. */
+  __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) {
+    const __m512i a_bits = _mm512_castps_si512(a);
+    const __m512i b_bits = _mm512_castps_si512(b);
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(b_bits, mm512_int2mask(0x33), a_bits, (_MM_PERM_ENUM)0x4e));
+  }
+
+  /*! Rearranges the 32-bit lanes of v according to the per-lane indices in `index`. */
+  __forceinline vfloat16 permute(vfloat16 v, __m512i index) {
+    const __m512i bits = _mm512_castps_si512(v);
+    return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, bits));
+  }
+
+  /*! Permutes v with the index vector 15, 14, ..., 0 (listed from the lowest lane up). */
+  __forceinline vfloat16 reverse(const vfloat16& v) {
+    const __m512i descending = _mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+    return permute(v, descending);
+  }
+
+  /*! Runs _mm512_alignr_epi32 on the bit patterns of a and b with element count i
+   *  and returns the result reinterpreted as floats. */
+  template<int i>
+  __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i));
+  }
+
+  /*! Masked variant of align_shift_right: lanes selected by mask receive the
+   *  _mm512_alignr_epi32 result of (a, b, i); all other lanes are copied from c.
+   *  c is only read, so it is taken by const reference. */
+  template<int i>
+  __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i));
+  }
+ 
+  /*! Masked align-shift of a against itself by 15 elements; the lane excluded by
+   *  mask 0xfffe (lane 0) is taken from a zero vector. */
+  __forceinline vfloat16 shift_left_1(const vfloat16& a)
+  {
+    vfloat16 zeros(zero);
+    return mask_align_shift_right<15>(0xfffe, zeros, a, a);
+  }
+
+  /*! Align-shift of (zero, x) by one element. */
+  __forceinline vfloat16 shift_right_1(const vfloat16& x)
+  {
+    const vfloat16 zeros(zero);
+    return align_shift_right<1>(zeros, x);
+  }
+
+  /*! Extracts a single float from v through the mm512_cvtss_f32 helper. */
+  __forceinline float toScalar(const vfloat16& v)
+  {
+    return mm512_cvtss_f32(v);
+  }
+
+
+  /*! Returns a with its i-th group of four floats replaced by b. */
+  template<int i>
+  __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b)
+  {
+    return _mm512_insertf32x4(a, b, i);
+  }
+
+  template<int N, int i>
+  vfloat<N> extractN(const vfloat16& v);
+
+  template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v);    }
+  template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); }
+  template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); }
+  template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); }
+
+  template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v);    }
+  template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); }
+
+  /*! Extracts the i-th group of four floats; group 0 is a plain cast of the low part. */
+  template<int i>
+  __forceinline vfloat4 extract4(const vfloat16& v) {
+    return _mm512_extractf32x4_ps(v, i);
+  }
+  template<>
+  __forceinline vfloat4 extract4<0>(const vfloat16& v) {
+    return _mm512_castps512_ps128(v);
+  }
+
+  template<int i> __forceinline vfloat8 extract8   (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); }
+  template<>      __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v);    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Transpose
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Two rounds of unpacklo/unpackhi over the rows r0..r3, writing the results to c0..c3. */
+  __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3)
+  {
+    // first round: pair row 0 with row 2 and row 1 with row 3
+    const vfloat16 lo02 = unpacklo(r0, r2);
+    const vfloat16 hi02 = unpackhi(r0, r2);
+    const vfloat16 lo13 = unpacklo(r1, r3);
+    const vfloat16 hi13 = unpackhi(r1, r3);
+
+    // second round: combine the partial results into the outputs
+    c0 = unpacklo(lo02, lo13);
+    c1 = unpackhi(lo02, lo13);
+    c2 = unpacklo(hi02, hi13);
+    c3 = unpackhi(hi02, hi13);
+  }
+
+  /*! Packs sixteen 4-wide rows into four 16-wide rows (rows k, k+4, k+8, k+12 share one
+   *  vector) and forwards to the 16-wide four-row transpose. */
+  __forceinline void transpose(const vfloat4& r0,  const vfloat4& r1,  const vfloat4& r2,  const vfloat4& r3,
+                               const vfloat4& r4,  const vfloat4& r5,  const vfloat4& r6,  const vfloat4& r7,
+                               const vfloat4& r8,  const vfloat4& r9,  const vfloat4& r10, const vfloat4& r11,
+                               const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3)
+  {
+    const vfloat16 packed0(r0, r4, r8,  r12);
+    const vfloat16 packed1(r1, r5, r9,  r13);
+    const vfloat16 packed2(r2, r6, r10, r14);
+    const vfloat16 packed3(r3, r7, r11, r15);
+    transpose(packed0, packed1, packed2, packed3, c0, c1, c2, c3);
+  }
+
+  /*! Eight-row variant: runs the four-row transpose on r0..r3 and on r4..r7, then merges
+   *  the two results with interleave4_even (c0..c3) and interleave4_odd (c4..c7). */
+  __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3,
+                               const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3,
+                               vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)
+  {
+    // four-row transpose of the first half of the input rows
+    vfloat16 first0, first1, first2, first3;
+    transpose(r0, r1, r2, r3, first0, first1, first2, first3);
+
+    // four-row transpose of the second half of the input rows
+    vfloat16 second0, second1, second2, second3;
+    transpose(r4, r5, r6, r7, second0, second1, second2, second3);
+
+    c0 = interleave4_even(first0, second0);
+    c1 = interleave4_even(first1, second1);
+    c2 = interleave4_even(first2, second2);
+    c3 = interleave4_even(first3, second3);
+    c4 = interleave4_odd (first0, second0);
+    c5 = interleave4_odd (first1, second1);
+    c6 = interleave4_odd (first2, second2);
+    c7 = interleave4_odd (first3, second3);
+  }
+
+  /*! Packs sixteen 8-wide rows into eight 16-wide rows (rows k and k+8 share one vector)
+   *  and forwards to the 16-wide eight-row transpose. */
+  __forceinline void transpose(const vfloat8& r0,  const vfloat8& r1,  const vfloat8& r2,  const vfloat8& r3,
+                               const vfloat8& r4,  const vfloat8& r5,  const vfloat8& r6,  const vfloat8& r7,
+                               const vfloat8& r8,  const vfloat8& r9,  const vfloat8& r10, const vfloat8& r11,
+                               const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3,
+                               vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)
+  {
+    const vfloat16 packed0(r0, r8);
+    const vfloat16 packed1(r1, r9);
+    const vfloat16 packed2(r2, r10);
+    const vfloat16 packed3(r3, r11);
+    const vfloat16 packed4(r4, r12);
+    const vfloat16 packed5(r5, r13);
+    const vfloat16 packed6(r6, r14);
+    const vfloat16 packed7(r7, r15);
+    transpose(packed0, packed1, packed2, packed3, packed4, packed5, packed6, packed7,
+              c0, c1, c2, c3, c4, c5, c6, c7);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Vector reductions: every stage combines x with a shuffled copy of itself; the
+   *  2-, 4- and 8-wide stages feed the full-width reduction. */
+  __forceinline vfloat16 vreduce_add2(vfloat16 x) {
+    return x + shuffle<1,0,3,2>(x);
+  }
+  __forceinline vfloat16 vreduce_add4(vfloat16 x) {
+    const vfloat16 r2 = vreduce_add2(x);
+    return r2 + shuffle<2,3,0,1>(r2);
+  }
+  __forceinline vfloat16 vreduce_add8(vfloat16 x) {
+    const vfloat16 r4 = vreduce_add4(x);
+    return r4 + shuffle4<1,0,3,2>(r4);
+  }
+  __forceinline vfloat16 vreduce_add (vfloat16 x) {
+    const vfloat16 r8 = vreduce_add8(x);
+    return r8 + shuffle4<2,3,0,1>(r8);
+  }
+
+  __forceinline vfloat16 vreduce_min2(vfloat16 x) {
+    return min(x, shuffle<1,0,3,2>(x));
+  }
+  __forceinline vfloat16 vreduce_min4(vfloat16 x) {
+    const vfloat16 r2 = vreduce_min2(x);
+    return min(r2, shuffle<2,3,0,1>(r2));
+  }
+  __forceinline vfloat16 vreduce_min8(vfloat16 x) {
+    const vfloat16 r4 = vreduce_min4(x);
+    return min(r4, shuffle4<1,0,3,2>(r4));
+  }
+  __forceinline vfloat16 vreduce_min (vfloat16 x) {
+    const vfloat16 r8 = vreduce_min8(x);
+    return min(r8, shuffle4<2,3,0,1>(r8));
+  }
+
+  __forceinline vfloat16 vreduce_max2(vfloat16 x) {
+    return max(x, shuffle<1,0,3,2>(x));
+  }
+  __forceinline vfloat16 vreduce_max4(vfloat16 x) {
+    const vfloat16 r2 = vreduce_max2(x);
+    return max(r2, shuffle<2,3,0,1>(r2));
+  }
+  __forceinline vfloat16 vreduce_max8(vfloat16 x) {
+    const vfloat16 r4 = vreduce_max4(x);
+    return max(r4, shuffle4<1,0,3,2>(r4));
+  }
+  __forceinline vfloat16 vreduce_max (vfloat16 x) {
+    const vfloat16 r8 = vreduce_max8(x);
+    return max(r8, shuffle4<2,3,0,1>(r8));
+  }
+
+  /*! Scalar reductions: run the vector reduction and convert the result with toScalar. */
+  __forceinline float reduce_add(const vfloat16& v) {
+    const vfloat16 total = vreduce_add(v);
+    return toScalar(total);
+  }
+  __forceinline float reduce_min(const vfloat16& v) {
+    const vfloat16 lowest = vreduce_min(v);
+    return toScalar(lowest);
+  }
+  __forceinline float reduce_max(const vfloat16& v) {
+    const vfloat16 highest = vreduce_max(v);
+    return toScalar(highest);
+  }
+ 
+  /*! Index of the first lane whose bit pattern equals that of the reduced minimum. */
+  __forceinline size_t select_min(const vfloat16& v) {
+    const __m512i lanes  = _mm512_castps_si512(v);
+    const __m512i target = _mm512_castps_si512(vreduce_min(v));
+    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(lanes, target, _MM_CMPINT_EQ)));
+  }
+
+  /*! Index of the first lane whose bit pattern equals that of the reduced maximum. */
+  __forceinline size_t select_max(const vfloat16& v) {
+    const __m512i lanes  = _mm512_castps_si512(v);
+    const __m512i target = _mm512_castps_si512(vreduce_max(v));
+    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(lanes, target, _MM_CMPINT_EQ)));
+  }
+
+  /*! Index of the first valid lane holding the minimum over the valid lanes. Invalid lanes
+   *  are replaced by +inf; if no lane matches the reduced value, the first valid lane is used. */
+  __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v)
+  {
+    const vfloat16 candidates = select(valid, v, vfloat16(pos_inf));
+    const vbool16 hits = valid & (candidates == vreduce_min(candidates));
+    if (any(hits))
+      return bsf(movemask(hits));
+    return bsf(movemask(valid));
+  }
+
+  /*! Index of the first valid lane holding the maximum over the valid lanes. Invalid lanes
+   *  are replaced by -inf; if no lane matches the reduced value, the first valid lane is used. */
+  __forceinline size_t select_max(const vboolf16& valid, const vfloat16& v)
+  {
+    const vfloat16 candidates = select(valid, v, vfloat16(neg_inf));
+    const vbool16 hits = valid & (candidates == vreduce_max(candidates));
+    if (any(hits))
+      return bsf(movemask(hits));
+    return bsf(movemask(valid));
+  }
+  
+  /*! Log-step scan: in four rounds the running vector is added to a copy of itself that was
+   *  align-shifted against a zero vector by 16-1, 16-2, 16-4 and 16-8 elements. */
+  __forceinline vfloat16 prefix_sum(const vfloat16& a)
+  {
+    const vfloat16 zeros(zero);
+    vfloat16 acc = a;
+    acc += align_shift_right<15>(acc, zeros);
+    acc += align_shift_right<14>(acc, zeros);
+    acc += align_shift_right<12>(acc, zeros);
+    acc += align_shift_right< 8>(acc, zeros);
+    return acc;
+  }
+
+  /*! Same scan with the operands of the align-shift swapped and shift counts 1, 2, 4, 8. */
+  __forceinline vfloat16 reverse_prefix_sum(const vfloat16& a)
+  {
+    const vfloat16 zeros(zero);
+    vfloat16 acc = a;
+    acc += align_shift_right<1>(zeros, acc);
+    acc += align_shift_right<2>(zeros, acc);
+    acc += align_shift_right<4>(zeros, acc);
+    acc += align_shift_right<8>(zeros, acc);
+    return acc;
+  }
+
+  /*! Log-step scan with min: the shifted-in lanes come from a +inf vector, which leaves
+   *  the minimum unchanged. */
+  __forceinline vfloat16 prefix_min(const vfloat16& a)
+  {
+    const vfloat16 fill(pos_inf);
+    vfloat16 acc = a;
+    acc = min(acc, align_shift_right<15>(acc, fill));
+    acc = min(acc, align_shift_right<14>(acc, fill));
+    acc = min(acc, align_shift_right<12>(acc, fill));
+    acc = min(acc, align_shift_right< 8>(acc, fill));
+    return acc;
+  }
+
+  /*! Log-step scan with max: the shifted-in lanes come from a -inf vector, which leaves
+   *  the maximum unchanged. */
+  __forceinline vfloat16 prefix_max(const vfloat16& a)
+  {
+    const vfloat16 fill(neg_inf);
+    vfloat16 acc = a;
+    acc = max(acc, align_shift_right<15>(acc, fill));
+    acc = max(acc, align_shift_right<14>(acc, fill));
+    acc = max(acc, align_shift_right<12>(acc, fill));
+    acc = max(acc, align_shift_right< 8>(acc, fill));
+    return acc;
+  }
+
+
+  /*! Reverse log-step scan with min: shift counts 1, 2, 4, 8 with +inf shifted in. */
+  __forceinline vfloat16 reverse_prefix_min(const vfloat16& a)
+  {
+    const vfloat16 fill(pos_inf);
+    vfloat16 acc = a;
+    acc = min(acc, align_shift_right<1>(fill, acc));
+    acc = min(acc, align_shift_right<2>(fill, acc));
+    acc = min(acc, align_shift_right<4>(fill, acc));
+    acc = min(acc, align_shift_right<8>(fill, acc));
+    return acc;
+  }
+
+  /*! Reverse log-step scan with max: shift counts 1, 2, 4, 8 with -inf shifted in. */
+  __forceinline vfloat16 reverse_prefix_max(const vfloat16& a)
+  {
+    const vfloat16 fill(neg_inf);
+    vfloat16 acc = a;
+    acc = max(acc, align_shift_right<1>(fill, acc));
+    acc = max(acc, align_shift_right<2>(fill, acc));
+    acc = max(acc, align_shift_right<4>(fill, acc));
+    acc = max(acc, align_shift_right<8>(fill, acc));
+    return acc;
+  }
+
+  /*! Reciprocal that substitutes min_rcp_input for lanes equal to zero before calling rcp. */
+  __forceinline vfloat16 rcp_safe(const vfloat16& a)
+  {
+    const vboolf16 nonzero = a != vfloat16(zero);
+    const vfloat16 guarded = select(nonzero, a, vfloat16(min_rcp_input));
+    return rcp(guarded);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  /*! Prints the sixteen lanes as "<v0, v1, ..., v15>". */
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v)
+  {
+    // the first lane is introduced by the opening bracket, every later one by a separator
+    for (int lane=0; lane<16; lane++)
+      cout << (lane == 0 ? "<" : ", ") << v[lane];
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vfloat4_sse2.h
+++ b/thirdparty/embree/common/simd/vfloat4_sse2.h
@@ -0,0 +1,805 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide SSE float type.
+   *
+   * Specialization of vfloat for four single-precision lanes held in one
+   * __m128. An anonymous union exposes the same storage lane-wise as
+   * float f[4] and int i[4]. Many members pick their implementation at
+   * compile time from __AVX512VL__, __AVX2__, __AVX__, __SSE4_1__ or
+   * __aarch64__. */
+  template<>
+  struct vfloat<4>
+  {
+    ALIGNED_STRUCT_(16);
+    /* mask, integer and float vector types paired with this type */
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+    typedef vfloat4 Float;
+    
+    enum  { size = 4 };                        // number of SIMD elements
+    union { __m128 v; float f[4]; int i[4]; }; // data: register view (v) plus per-lane float (f) and int (i) views of the same storage
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vfloat() {} // performs no initialization of the lanes
+    __forceinline vfloat(const vfloat4& other) { v = other.v; }
+    //__forceinline vfloat(const vfloat4& other) = default;
+    
+    __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; }
+    /* wrap a raw __m128, and convert back to one implicitly so a vfloat4 can be handed directly to intrinsics */
+    __forceinline vfloat(__m128 a) : v(a) {}
+    __forceinline operator const __m128&() const { return v; }
+    __forceinline operator       __m128&()       { return v; }
+
+    __forceinline vfloat(float a) : v(_mm_set1_ps(a)) {} // same value in all four lanes
+    __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {} // a is lane 0, d is lane 3; _mm_set_ps lists the highest lane first
+
+    __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {} // numeric conversion of signed 32-bit lanes
+#if defined(__aarch64__)
+    __forceinline explicit vfloat(const vuint4& x) { // numeric conversion of unsigned 32-bit lanes via the NEON unsigned convert
+        v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v));
+    }
+#else
+    __forceinline explicit vfloat(const vuint4& x) { // numeric conversion of unsigned 32-bit lanes, built from the signed convert
+      const __m128i a   = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); // low 31 bits of each lane
+      const __m128i b   = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 
+      const __m128  af  = _mm_cvtepi32_ps(a);
+      const __m128  bf  = _mm_castsi128_ps(b);  // 2^31 as float where the top bit was set, 0.0f elsewhere
+      v  = _mm_add_ps(af,bf);
+    }
+#endif
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    /* tag-type constructors: each fills the vector according to the tag type of its parameter */
+    __forceinline vfloat(ZeroTy)   : v(_mm_setzero_ps()) {}
+    __forceinline vfloat(OneTy)    : v(_mm_set1_ps(1.0f)) {}
+    __forceinline vfloat(PosInfTy) : v(_mm_set1_ps(pos_inf)) {}
+    __forceinline vfloat(NegInfTy) : v(_mm_set1_ps(neg_inf)) {}
+    __forceinline vfloat(StepTy)   : v(_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)) {} // lanes 0..3 hold 0, 1, 2, 3
+    __forceinline vfloat(NaNTy)    : v(_mm_set1_ps(nan)) {}
+    __forceinline vfloat(UndefinedTy) : v(_mm_undefined_ps()) {} // contents unspecified
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+    /* load/store use the aligned intrinsics (_mm_load_ps/_mm_store_ps);
+     * loadu/storeu use the unaligned ones. */
+    static __forceinline vfloat4 load (const void* a) { return _mm_load_ps((float*)a); }
+    static __forceinline vfloat4 loadu(const void* a) { return _mm_loadu_ps((float*)a); }
+
+    static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v); }
+    /* Masked variants. The masked loads produce zero in lanes whose mask is
+     * off (zero source operand, maskload, or AND with the mask). The generic
+     * fallback loads the full 16 bytes, and its masked store is a
+     * read-modify-write of all 16 bytes, so memory behind inactive lanes is
+     * still accessed in that branch. */
+#if defined(__AVX512VL__)
+
+    static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask,(float*)ptr); }
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask,(float*)ptr); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask,v); }
+#elif defined(__AVX__)
+    static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } // same intrinsic as loadu in this branch
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } // same intrinsic as storeu in this branch
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); }
+#else
+    static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_load_ps ((float*)ptr),mask); }
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { store (ptr,select(mask,v,load (ptr))); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { storeu(ptr,select(mask,v,loadu(ptr))); }
+#endif
+    /* broadcast: reads one float at a and replicates it into all four lanes */
+#if defined(__AVX__)
+    static __forceinline vfloat4 broadcast(const void* a) { return _mm_broadcast_ss((float*)a); }
+#else
+    static __forceinline vfloat4 broadcast(const void* a) { return _mm_set1_ps(*(float*)a); }
+#endif
+    /* load_nt: streaming load (_mm_stream_load_si128) under __SSE4_1__, plain aligned load otherwise */
+    static __forceinline vfloat4 load_nt (const float* ptr) {
+#if defined (__SSE4_1__)
+    return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr));
+#else
+    return _mm_load_ps(ptr); 
+#endif
+  }
+    /* Loads four signed 8-bit values and converts each to float.
+     * NOTE(review): the __SSE4_1__ branch issues a 16-byte unaligned load
+     * although only the low 4 bytes are converted - presumably callers
+     * guarantee readable padding behind ptr; confirm. */
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const char* ptr) {
+        return __m128(_mm_load4epi8_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vfloat4 load(const char* ptr) {
+      return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+    }
+#else
+    static __forceinline vfloat4 load(const char* ptr) {
+      return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+#endif
+    /* Loads four unsigned 8-bit values and converts each to float (same 16-byte load remark as above for __SSE4_1__). */
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const unsigned char* ptr) {
+        return __m128(_mm_load4epu8_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vfloat4 load(const unsigned char* ptr) {
+      return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+    }
+#else
+    static __forceinline vfloat4 load(const unsigned char* ptr) {
+      //return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions
+      return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+#endif
+    /* Loads four signed 16-bit values and converts each to float (the __SSE4_1__ branch again loads 16 bytes, of which the low 8 are converted). */
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const short* ptr) {
+        return __m128(_mm_load4epi16_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vfloat4 load(const short* ptr) {
+      return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
+    }
+#else
+    static __forceinline vfloat4 load(const short* ptr) {
+      return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+#endif
+    /* Unlike the other integer loads, this one scales the values obtained from vint4::load(ptr) by 1/65535. */
+    static __forceinline vfloat4 load(const unsigned short* ptr) {
+      return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f));
+    }
+    /* store_nt: streaming store under __SSE4_1__, regular aligned store otherwise; both arms of the inner __aarch64__ test are identical */
+    static __forceinline void store_nt(void* ptr, const vfloat4& v)
+    {
+#if defined (__SSE4_1__)
+#if defined(__aarch64__)
+      _mm_stream_ps((float*)ptr,v);
+#else
+      _mm_stream_ps((float*)ptr,v);
+#endif
+#else
+      _mm_store_ps((float*)ptr,v);
+#endif
+    }
+    /* gather: lane k is read from the byte address ptr + scale*index[k].
+     * Uses the hardware gather under __AVX2__ (non-aarch64), otherwise four
+     * scalar reads. */
+    template<int scale = 4>
+    static __forceinline vfloat4 gather(const float* ptr, const vint4& index) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_i32gather_ps(ptr, index, scale);
+#else
+      return vfloat4(
+        *(float*)(((char*)ptr)+scale*index[0]),
+        *(float*)(((char*)ptr)+scale*index[1]),
+        *(float*)(((char*)ptr)+scale*index[2]),
+        *(float*)(((char*)ptr)+scale*index[3]));
+#endif
+    }
+    /* masked gather: only lanes with mask set are read; the others keep the zero that r starts with */
+    template<int scale = 4>
+    static __forceinline vfloat4 gather(const vboolf4& mask, const float* ptr, const vint4& index) {
+      vfloat4 r = zero;
+#if defined(__AVX512VL__)
+      return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale);
+#elif defined(__AVX2__)  && !defined(__aarch64__)
+      return _mm_mask_i32gather_ps(r, ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]);
+      return r;
+#endif
+    }
+    /* scatter: lane k of v is written to the byte address ptr + scale*index[k]; the fallback writes lanes 0..3 in order */
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint4& index, const vfloat4& v)
+    {
+#if defined(__AVX512VL__)
+      _mm_i32scatter_ps((float*)ptr, index, v, scale);
+#else
+      *(float*)(((char*)ptr)+scale*index[0]) = v[0];
+      *(float*)(((char*)ptr)+scale*index[1]) = v[1];
+      *(float*)(((char*)ptr)+scale*index[2]) = v[2];
+      *(float*)(((char*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+    /* masked scatter: as above, but lanes with mask off are not written */
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vfloat4& v)
+    {
+#if defined(__AVX512VL__)
+      _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale);
+#else
+      if (likely(mask[0])) *(float*)(((char*)ptr)+scale*index[0]) = v[0];
+      if (likely(mask[1])) *(float*)(((char*)ptr)+scale*index[1]) = v[1];
+      if (likely(mask[2])) *(float*)(((char*)ptr)+scale*index[2]) = v[2];
+      if (likely(mask[3])) *(float*)(((char*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+    /* masked scatter shorthands: with a char* base the offsets are scaled by 1 (bytes), with a float* base by 4 */
+    static __forceinline void store(const vboolf4& mask, char* ptr, const vint4& ofs, const vfloat4& v) {
+      scatter<1>(mask,ptr,ofs,v);
+    }
+    static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) {
+      scatter<4>(mask,ptr,ofs,v);
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    /* lane access through the float view of the union; the index is only checked by assert */
+    __forceinline const float& operator [](size_t index) const { assert(index < 4); return f[index]; }
+    __forceinline       float& operator [](size_t index)       { assert(index < 4); return f[index]; }
+    /* select: per lane, yields t where m is set and f where it is not (the parameter f shadows the union member f here) */
+    friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) {
+#if defined(__AVX512VL__)
+      return _mm_mask_blend_ps(m, f, t);
+#elif defined(__SSE4_1__) || (defined(__aarch64__))
+      return _mm_blendv_ps(f, t, m);
+#else
+      return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); 
+#endif
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Load/Store
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* mem<vfloat4>: forwards the masked load/store entry points to the static
+   * members of vfloat4 with the same names. */
+  template<> struct mem<vfloat4>
+  {
+    static __forceinline vfloat4 load(const vboolf4& mask, const void* ptr)
+    {
+      return vfloat4::load(mask, ptr);
+    }
+
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr)
+    {
+      return vfloat4::loadu(mask, ptr);
+    }
+
+    static __forceinline void store(const vboolf4& mask, void* ptr, const vfloat4& v)
+    {
+      vfloat4::store(mask, ptr, v);
+    }
+
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v)
+    {
+      vfloat4::storeu(mask, ptr, v);
+    }
+  };
+    
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Bit-pattern reinterpretations between float and integer vectors; the
+   * cast intrinsics change only the type, not the bits. */
+  __forceinline vfloat4 asFloat(const vint4& a)
+  {
+    return _mm_castsi128_ps(a);
+  }
+
+  __forceinline vint4 asInt(const vfloat4& a)
+  {
+    return _mm_castps_si128(a);
+  }
+
+  __forceinline vuint4 asUInt(const vfloat4& a)
+  {
+    return _mm_castps_si128(a);
+  }
+
+  /* Value conversions, delegating to the converting constructors of the
+   * target vector type. */
+  __forceinline vint4 toInt(const vfloat4& a)
+  {
+    const vint4 converted(a);
+    return converted;
+  }
+
+  __forceinline vfloat4 toFloat(const vint4& a)
+  {
+    const vfloat4 converted(a);
+    return converted;
+  }
+
+  /* Unary plus: returns a copy of the operand. */
+  __forceinline vfloat4 operator +(const vfloat4& a)
+  {
+    return a;
+  }
+
+  /* Unary minus: NEON negate on aarch64, otherwise flips the sign bit of
+   * every lane by XOR with 0x80000000. */
+  __forceinline vfloat4 operator -(const vfloat4& a)
+  {
+#if defined(__aarch64__)
+    return vnegq_f32(a);
+#else
+    const __m128 sign_bits = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+    return _mm_xor_ps(a, sign_bits);
+#endif
+  }
+
+  /* Lane-wise absolute value: _mm_abs_ps on aarch64, otherwise clears the
+   * sign bit of every lane by AND with 0x7fffffff. */
+  __forceinline vfloat4 abs(const vfloat4& a)
+  {
+#if defined(__aarch64__)
+    return _mm_abs_ps(a);
+#else
+    const __m128 magnitude_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+    return _mm_and_ps(a, magnitude_bits);
+#endif
+  }
+
+  /* Lane-wise sign: -1 where a < 0, +1 in every other lane (that includes
+   * zero lanes and lanes for which the ordered comparison is false). */
+  __forceinline vfloat4 sign(const vfloat4& a)
+  {
+    const vfloat4 plus_one(one);
+    const vfloat4 minus_one = -plus_one;
+    const vfloat4 origin(zero);
+#if defined(__AVX512VL__)
+    return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, origin, _CMP_LT_OQ), plus_one, minus_one);
+#else
+    return blendv_ps(plus_one, minus_one, _mm_cmplt_ps(a, origin));
+#endif
+  }
+
+  /* Keeps only the sign bit (0x80000000) of every lane. */
+  __forceinline vfloat4 signmsk(const vfloat4& a)
+  {
+    const __m128 sign_bits = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+    return _mm_and_ps(a, sign_bits);
+  }
+
+  /* Lane-wise reciprocal.
+   * aarch64: computed as the division 1.0f / a.
+   * x86: hardware estimate (_mm_rcp14_ps under __AVX512VL__, else
+   * _mm_rcp_ps) refined by one step of r + r * (1 - a * r), fused under
+   * __AVX2__. */
+  __forceinline vfloat4 rcp(const vfloat4& a)
+  {
+#if defined(__aarch64__)
+    return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v));
+#else
+
+#if defined(__AVX512VL__)
+    const vfloat4 estimate = _mm_rcp14_ps(a);
+#else
+    const vfloat4 estimate = _mm_rcp_ps(a);
+#endif
+    const vfloat4 unity(1.0f);
+
+#if defined(__AVX2__)
+    const __m128 residual = _mm_fnmadd_ps(a, estimate, unity);              // 1 - a * r
+    return _mm_fmadd_ps(estimate, residual, estimate);                      // r * residual + r
+#else
+    const __m128 residual = _mm_sub_ps(unity, _mm_mul_ps(a, estimate));     // 1 - a * r
+    return _mm_add_ps(estimate, _mm_mul_ps(estimate, residual));            // r + r * residual
+#endif
+
+#endif  //defined(__aarch64__)
+  }
+  /* Lane-wise square (a * a). */
+  __forceinline vfloat4 sqr(const vfloat4& a)
+  {
+    return _mm_mul_ps(a, a);
+  }
+
+  /* Lane-wise square root. */
+  __forceinline vfloat4 sqrt(const vfloat4& a)
+  {
+    return _mm_sqrt_ps(a);
+  }
+
+  /* Lane-wise reciprocal square root.
+   * aarch64: _mm_rsqrt_ps estimate followed by three vrsqrtsq_f32 steps.
+   * x86: hardware estimate (_mm_rsqrt14_ps under __AVX512VL__, else
+   * _mm_rsqrt_ps) followed by one step of
+   * 1.5*r + ((a * -0.5) * r) * (r * r), whose final multiply-add is fused
+   * under __AVX2__. */
+  __forceinline vfloat4 rsqrt(const vfloat4& a)
+  {
+#if defined(__aarch64__)
+    vfloat4 estimate = _mm_rsqrt_ps(a);
+    for (int step = 0; step < 3; ++step)
+      estimate = vmulq_f32(estimate, vrsqrtsq_f32(vmulq_f32(a, estimate), estimate));
+    return estimate;
+#else
+
+#if defined(__AVX512VL__)
+    const vfloat4 estimate = _mm_rsqrt14_ps(a);
+#else
+    const vfloat4 estimate = _mm_rsqrt_ps(a);
+#endif
+
+    // correction term ((a * -0.5) * r) * (r * r), evaluated in that order
+    const __m128 neg_half_a = _mm_mul_ps(a, _mm_set1_ps(-0.5f));
+    const __m128 correction = _mm_mul_ps(_mm_mul_ps(neg_half_a, estimate), _mm_mul_ps(estimate, estimate));
+
+#if defined(__AVX2__)
+    return _mm_fmadd_ps(_mm_set1_ps(1.5f), estimate, correction);
+#else
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), estimate), correction);
+#endif
+
+#endif
+  }
+
+  /* Lane-wise NaN test on the bit pattern: after the sign bit is cleared, a
+   * lane is reported when its bits compare greater (as integers) than
+   * 0x7f800000. */
+  __forceinline vboolf4 isnan(const vfloat4& a)
+  {
+    const __m128  without_sign = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
+    const __m128i magnitude    = _mm_castps_si128(without_sign);
+    const __m128i threshold    = _mm_set1_epi32(0x7f800000);
+#if defined(__AVX512VL__)
+    return _mm_cmp_epi32_mask(magnitude, threshold, _MM_CMPINT_GT);
+#else
+    return _mm_castsi128_ps(_mm_cmpgt_epi32(magnitude, threshold));
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Lane-wise arithmetic. The vector/vector forms map to one SSE intrinsic
+   * each; the mixed forms first broadcast the scalar into a vfloat4 and then
+   * use the vector/vector form. */
+
+  // addition
+  __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return _mm_add_ps(a, b); }
+  __forceinline vfloat4 operator +(const vfloat4& a, float b) { const vfloat4 rhs(b); return a + rhs; }
+  __forceinline vfloat4 operator +(float a, const vfloat4& b) { const vfloat4 lhs(a); return lhs + b; }
+
+  // subtraction
+  __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return _mm_sub_ps(a, b); }
+  __forceinline vfloat4 operator -(const vfloat4& a, float b) { const vfloat4 rhs(b); return a - rhs; }
+  __forceinline vfloat4 operator -(float a, const vfloat4& b) { const vfloat4 lhs(a); return lhs - b; }
+
+  // multiplication
+  __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return _mm_mul_ps(a, b); }
+  __forceinline vfloat4 operator *(const vfloat4& a, float b) { const vfloat4 rhs(b); return a * rhs; }
+  __forceinline vfloat4 operator *(float a, const vfloat4& b) { const vfloat4 lhs(a); return lhs * b; }
+
+  // division
+  __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return _mm_div_ps(a, b); }
+  __forceinline vfloat4 operator /(const vfloat4& a, float b) { const vfloat4 rhs(b); return a / rhs; }
+  __forceinline vfloat4 operator /(float a, const vfloat4& b) { const vfloat4 lhs(a); return lhs / b; }
+
+  /* Bitwise operations on the raw lane bits. The last overload XORs with an
+   * integer vector whose bits are reinterpreted as float lanes. */
+  __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b)
+  {
+    return _mm_and_ps(a, b);
+  }
+
+  __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b)
+  {
+    return _mm_or_ps(a, b);
+  }
+
+  __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b)
+  {
+    return _mm_xor_ps(a, b);
+  }
+
+  __forceinline vfloat4 operator ^(const vfloat4& a, const vint4& b)
+  {
+    const __m128 b_bits = _mm_castsi128_ps(b);
+    return _mm_xor_ps(a, b_bits);
+  }
+
+  /* Lane-wise minimum / maximum. Scalars are broadcast first. The operand
+   * order handed to _mm_min_ps/_mm_max_ps is the argument order, which
+   * matters because these intrinsics are not symmetric for NaN inputs. */
+  __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return _mm_min_ps(a, b); }
+  __forceinline vfloat4 min(const vfloat4& a, float b) { const vfloat4 rhs(b); return _mm_min_ps(a, rhs); }
+  __forceinline vfloat4 min(float a, const vfloat4& b) { const vfloat4 lhs(a); return _mm_min_ps(lhs, b); }
+
+  __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return _mm_max_ps(a, b); }
+  __forceinline vfloat4 max(const vfloat4& a, float b) { const vfloat4 rhs(b); return _mm_max_ps(a, rhs); }
+  __forceinline vfloat4 max(float a, const vfloat4& b) { const vfloat4 lhs(a); return _mm_max_ps(lhs, b); }
+
+  /* min/max that compare the lane bit patterns as 32-bit integers (signed
+   * for mini/maxi, unsigned for minui/maxui) when SSE4.1 or aarch64 is
+   * available. Without them only mini/maxi exist and they fall back to the
+   * floating-point min/max. */
+#if defined(__SSE4_1__) || defined(__aarch64__)
+
+    __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) {
+      return _mm_castsi128_ps(_mm_min_epi32(_mm_castps_si128(a), _mm_castps_si128(b)));
+    }
+
+    __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) {
+      return _mm_castsi128_ps(_mm_max_epi32(_mm_castps_si128(a), _mm_castps_si128(b)));
+    }
+
+    __forceinline vfloat4 minui(const vfloat4& a, const vfloat4& b) {
+      return _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(a), _mm_castps_si128(b)));
+    }
+
+    __forceinline vfloat4 maxui(const vfloat4& a, const vfloat4& b) {
+      return _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(a), _mm_castps_si128(b)));
+    }
+#else
+    __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) {
+      const vfloat4 smaller = min(a,b);
+      return smaller;
+    }
+
+    __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) {
+      const vfloat4 larger = max(a,b);
+      return larger;
+    }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Multiply-add family:
+   *   madd  =  a*b + c     msub  =  a*b - c
+   *   nmadd = -a*b + c     nmsub = -a*b - c
+   * Fused intrinsics are used under __AVX2__ or __ARM_NEON; otherwise the
+   * expressions are evaluated with separate multiply and add/subtract. */
+  __forceinline vfloat4 madd(const vfloat4& a, const vfloat4& b, const vfloat4& c)
+  {
+#if defined(__AVX2__) || defined(__ARM_NEON)
+    return _mm_fmadd_ps(a,b,c);
+#else
+    return (a*b) + c;
+#endif
+  }
+
+  __forceinline vfloat4 msub(const vfloat4& a, const vfloat4& b, const vfloat4& c)
+  {
+#if defined(__AVX2__) || defined(__ARM_NEON)
+    return _mm_fmsub_ps(a,b,c);
+#else
+    return (a*b) - c;
+#endif
+  }
+
+  __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c)
+  {
+#if defined(__AVX2__) || defined(__ARM_NEON)
+    return _mm_fnmadd_ps(a,b,c);
+#else
+    return ((-a)*b) + c;
+#endif
+  }
+
+  __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c)
+  {
+#if defined(__AVX2__) || defined(__ARM_NEON)
+    return _mm_fnmsub_ps(a,b,c);
+#else
+    return ((-a)*b) - c;
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Compound assignment: evaluates the matching binary operator, stores the
+   * result in a and returns a reference to a. */
+  __forceinline vfloat4& operator +=(vfloat4& a, const vfloat4& b) { a = a + b; return a; }
+  __forceinline vfloat4& operator +=(vfloat4& a, float          b) { a = a + b; return a; }
+
+  __forceinline vfloat4& operator -=(vfloat4& a, const vfloat4& b) { a = a - b; return a; }
+  __forceinline vfloat4& operator -=(vfloat4& a, float          b) { a = a - b; return a; }
+
+  __forceinline vfloat4& operator *=(vfloat4& a, const vfloat4& b) { a = a * b; return a; }
+  __forceinline vfloat4& operator *=(vfloat4& a, float          b) { a = a * b; return a; }
+
+  __forceinline vfloat4& operator /=(vfloat4& a, const vfloat4& b) { a = a / b; return a; }
+  __forceinline vfloat4& operator /=(vfloat4& a, float          b) { a = a / b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Lane-wise comparisons producing a vboolf4.
+   * __AVX512VL__: _mm_cmp_ps_mask with the _MM_CMPINT_* predicate.
+   * otherwise: the SSE compare intrinsics; outside aarch64, >= and > are
+   * expressed as "not less than" / "not less or equal". */
+  __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b)
+  {
+#if defined(__AVX512VL__)
+    return _mm_cmp_ps_mask(a, b, _MM_CMPINT_EQ);
+#else
+    return _mm_cmpeq_ps (a, b);
+#endif
+  }
+
+  __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b)
+  {
+#if defined(__AVX512VL__)
+    return _mm_cmp_ps_mask(a, b, _MM_CMPINT_NE);
+#else
+    return _mm_cmpneq_ps(a, b);
+#endif
+  }
+
+  __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b)
+  {
+#if defined(__AVX512VL__)
+    return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LT);
+#else
+    return _mm_cmplt_ps (a, b);
+#endif
+  }
+
+  __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b)
+  {
+#if defined(__AVX512VL__)
+    return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GE);
+#elif defined(__aarch64__)
+    return _mm_cmpge_ps (a, b);
+#else
+    return _mm_cmpnlt_ps(a, b);
+#endif
+  }
+
+  __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b)
+  {
+#if defined(__AVX512VL__)
+    return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GT);
+#elif defined(__aarch64__)
+    return _mm_cmpgt_ps (a, b);
+#else
+    return _mm_cmpnle_ps(a, b);
+#endif
+  }
+
+  __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b)
+  {
+#if defined(__AVX512VL__)
+    return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LE);
+#else
+    return _mm_cmple_ps (a, b);
+#endif
+  }
+
+  /* Comparisons against a scalar: the scalar is broadcast into a vfloat4 and
+   * the vector/vector comparison is applied with the operands in the
+   * original order. */
+  __forceinline vboolf4 operator ==(const vfloat4& a, float b) { const vfloat4 rhs(b); return a == rhs; }
+  __forceinline vboolf4 operator ==(float a, const vfloat4& b) { const vfloat4 lhs(a); return lhs == b; }
+
+  __forceinline vboolf4 operator !=(const vfloat4& a, float b) { const vfloat4 rhs(b); return a != rhs; }
+  __forceinline vboolf4 operator !=(float a, const vfloat4& b) { const vfloat4 lhs(a); return lhs != b; }
+
+  __forceinline vboolf4 operator < (const vfloat4& a, float b) { const vfloat4 rhs(b); return a <  rhs; }
+  __forceinline vboolf4 operator < (float a, const vfloat4& b) { const vfloat4 lhs(a); return lhs <  b; }
+
+  __forceinline vboolf4 operator >=(const vfloat4& a, float b) { const vfloat4 rhs(b); return a >= rhs; }
+  __forceinline vboolf4 operator >=(float a, const vfloat4& b) { const vfloat4 lhs(a); return lhs >= b; }
+
+  __forceinline vboolf4 operator > (const vfloat4& a, float b) { const vfloat4 rhs(b); return a >  rhs; }
+  __forceinline vboolf4 operator > (float a, const vfloat4& b) { const vfloat4 lhs(a); return lhs >  b; }
+
+  __forceinline vboolf4 operator <=(const vfloat4& a, float b) { const vfloat4 rhs(b); return a <= rhs; }
+  __forceinline vboolf4 operator <=(float a, const vfloat4& b) { const vfloat4 lhs(a); return lhs <= b; }
+
+  /* Named forms of the comparison operators. */
+  __forceinline vboolf4 eq(const vfloat4& a, const vfloat4& b) { const vboolf4 r = (a == b); return r; }
+  __forceinline vboolf4 ne(const vfloat4& a, const vfloat4& b) { const vboolf4 r = (a != b); return r; }
+  __forceinline vboolf4 lt(const vfloat4& a, const vfloat4& b) { const vboolf4 r = (a <  b); return r; }
+  __forceinline vboolf4 ge(const vfloat4& a, const vfloat4& b) { const vboolf4 r = (a >= b); return r; }
+  __forceinline vboolf4 gt(const vfloat4& a, const vfloat4& b) { const vboolf4 r = (a >  b); return r; }
+  __forceinline vboolf4 le(const vfloat4& a, const vfloat4& b) { const vboolf4 r = (a <= b); return r; }
+
+  /* Masked comparisons: the comparison result restricted to the lanes set in
+   * mask. __AVX512VL__ uses the masked compare intrinsic; otherwise the
+   * plain comparison is ANDed with the mask. */
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b)
+  {
+    return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ);
+  }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b)
+  {
+    return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE);
+  }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b)
+  {
+    return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT);
+  }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b)
+  {
+    return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE);
+  }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b)
+  {
+    return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT);
+  }
+  __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b)
+  {
+    return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE);
+  }
+#else
+  __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b)
+  {
+    const vboolf4 hit = (a == b);
+    return mask & hit;
+  }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b)
+  {
+    const vboolf4 hit = (a != b);
+    return mask & hit;
+  }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b)
+  {
+    const vboolf4 hit = (a <  b);
+    return mask & hit;
+  }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b)
+  {
+    const vboolf4 hit = (a >= b);
+    return mask & hit;
+  }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b)
+  {
+    const vboolf4 hit = (a >  b);
+    return mask & hit;
+  }
+  __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b)
+  {
+    const vboolf4 hit = (a <= b);
+    return mask & hit;
+  }
+#endif
+
+  /* Compile-time select: under __SSE4_1__, bit k of mask set means lane k is
+   * taken from t, otherwise from f (immediate blend). Without SSE4.1 the
+   * integer is turned into a vboolf4 and the runtime select is used. */
+  template<int mask>
+  __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f)
+  {
+#if defined(__SSE4_1__)
+    const __m128 blended = _mm_blend_ps(f, t, mask);
+    return blended;
+#else
+    const vboolf4 lanes(mask);
+    return select(lanes, t, f);
+#endif
+  }
+
+  /* Linear interpolation a + t*(b - a), evaluated as madd(t, b - a, a). */
+  __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t)
+  {
+    const vfloat4 span = b - a;
+    return madd(t, span, a);
+  }
+  
+  /* True when every lane lies strictly between -FLT_LARGE and +FLT_LARGE. */
+  __forceinline bool isvalid(const vfloat4& v)
+  {
+    const vboolf4 above_lower = v > vfloat4(-FLT_LARGE);
+    const vboolf4 below_upper = v < vfloat4(+FLT_LARGE);
+    return all(above_lower & below_upper);
+  }
+
+  /* True when every lane lies within [-FLT_MAX, +FLT_MAX]. */
+  __forceinline bool is_finite(const vfloat4& a)
+  {
+    const vboolf4 not_below = a >= vfloat4(-FLT_MAX);
+    const vboolf4 not_above = a <= vfloat4(+FLT_MAX);
+    return all(not_below & not_above);
+  }
+
+  /* Same range test, passed to the two-argument all() together with valid. */
+  __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a)
+  {
+    const vboolf4 not_below = a >= vfloat4(-FLT_MAX);
+    const vboolf4 not_above = a <= vfloat4(+FLT_MAX);
+    return all(valid, not_below & not_above);
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Lane-wise rounding to an integral float value. Three implementations:
+   * NEON rounding instructions on aarch64, _mm_round_ps under __SSE4_1__,
+   * and per-lane C library calls otherwise. */
+
+  // towards -inf
+  __forceinline vfloat4 floor(const vfloat4& a)
+  {
+#if defined(__aarch64__)
+    return vrndmq_f32(a.v);
+#elif defined (__SSE4_1__)
+    return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
+#else
+    return vfloat4(floorf(a[0]), floorf(a[1]), floorf(a[2]), floorf(a[3]));
+#endif
+  }
+
+  // towards +inf
+  __forceinline vfloat4 ceil(const vfloat4& a)
+  {
+#if defined(__aarch64__)
+    return vrndpq_f32(a.v);
+#elif defined (__SSE4_1__)
+    return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
+#else
+    return vfloat4(ceilf(a[0]), ceilf(a[1]), ceilf(a[2]), ceilf(a[3]));
+#endif
+  }
+
+  // towards 0
+  __forceinline vfloat4 trunc(const vfloat4& a)
+  {
+#if defined(__aarch64__)
+    return vrndq_f32(a.v);
+#elif defined (__SSE4_1__)
+    return _mm_round_ps(a, _MM_FROUND_TO_ZERO);
+#else
+    return vfloat4(truncf(a[0]), truncf(a[1]), truncf(a[2]), truncf(a[3]));
+#endif
+  }
+
+  // to nearest. NOTE(review): the aarch64 and SSE4.1 arms round ties to even,
+  // whereas roundf in the fallback rounds ties away from zero - confirm that
+  // callers do not rely on identical halfway behaviour across builds.
+  __forceinline vfloat4 round(const vfloat4& a)
+  {
+#if defined(__aarch64__)
+    return vrndnq_f32(a.v); // NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn?
+#elif defined (__SSE4_1__)
+    return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
+#else
+    return vfloat4(roundf(a[0]), roundf(a[1]), roundf(a[2]), roundf(a[3]));
+#endif
+  }
+  /* Fractional part relative to floor: a - floor(a). */
+  __forceinline vfloat4 frac(const vfloat4& a)
+  {
+    const vfloat4 whole = floor(a);
+    return a - whole;
+  }
+
+  /* Lane-wise floor of a, returned as integers.
+   *
+   * Every branch converts floor(a). Because floor(a) is already integral,
+   * the float->int conversion is exact whatever rounding the conversion
+   * itself applies.
+   *
+   * Do not replace the x86 branch with vint4(a - 0.5f) for targets without
+   * SSE4.1: if the conversion rounds to nearest with ties to even, that
+   * expression yields k-1 for odd integral inputs k (1.0 -> 0, 3.0 -> 2)
+   * and so disagrees with the other branches. */
+  __forceinline vint4 floori(const vfloat4& a) {
+#if defined(__aarch64__)
+    return vcvtq_s32_f32(floor(a));
+#else
+    return vint4(floor(a));
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Interleaves the two low lanes of a and b. */
+  __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b)
+  {
+    return _mm_unpacklo_ps(a, b);
+  }
+
+  /* Interleaves the two high lanes of a and b. */
+  __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b)
+  {
+    return _mm_unpackhi_ps(a, b);
+  }
+
+  /* One-operand shuffle: on x86, result lane k is v[ik] (the indices are
+   * passed to _MM_SHUFFLE in reverse, as that macro lists the highest lane
+   * first). aarch64 uses a byte table lookup driven by _MN_SHUFFLE. */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat4 shuffle(const vfloat4& v)
+  {
+#if defined(__aarch64__)
+    return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+#else
+    return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+#endif
+  }
+
+  /* Two-operand shuffle: on x86, lanes 0 and 1 of the result are a[i0] and
+   * a[i1], lanes 2 and 3 are b[i2] and b[i3] (_mm_shuffle_ps). aarch64 uses
+   * a two-register byte table lookup driven by _MF_SHUFFLE. */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b)
+  {
+#if defined(__aarch64__)
+    return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+#else
+    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+#endif
+  }
+
+  /* SSE3 specializations of the one-operand shuffle for three duplication
+   * patterns that have dedicated instructions (x86 only). */
+#if defined(__SSE3__) && !defined(__aarch64__)
+  template<>
+  __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v)
+  {
+    return _mm_moveldup_ps(v);
+  }
+
+  template<>
+  __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v)
+  {
+    return _mm_movehdup_ps(v);
+  }
+
+  template<>
+  __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v)
+  {
+    const __m128d as_double = _mm_castps_pd(v);
+    return _mm_castpd_ps(_mm_movedup_pd(as_double));
+  }
+#endif
+
+  /* Single-index shuffle: uses index i for all four result lanes. */
+  template<int i>
+  __forceinline vfloat4 shuffle(const vfloat4& v)
+  {
+    const vfloat4 splat = shuffle<i,i,i,i>(v);
+    return splat;
+  }
+
+  /* Returns lane i as a scalar. aarch64 indexes the vector directly; x86
+   * shuffles lane i into every lane and reads the lowest one, with lane 0
+   * specialized to skip the shuffle. */
+#if defined(__aarch64__)
+  template<int i>
+  __forceinline float extract(const vfloat4& a)
+  {
+    return a[i];
+  }
+#else
+  template<int i>
+  __forceinline float extract(const vfloat4& a)
+  {
+    const vfloat4 splat = shuffle<i>(a);
+    return _mm_cvtss_f32(splat);
+  }
+
+  template<>
+  __forceinline float extract<0>(const vfloat4& a)
+  {
+    return _mm_cvtss_f32(a);
+  }
+#endif
+
+  /* insert: returns a with lane dst replaced.
+   * SSE4.1 (x86): _mm_insert_ps, whose immediate packs the source lane in
+   * bits 7:6, the destination lane in bits 5:4 and the clr mask in the low
+   * bits; a scalar is placed in lane 0 of a temporary first.
+   * otherwise: copy a and overwrite one element; dst and src are reduced
+   * with & 3, and there is no clr variant. */
+#if defined (__SSE4_1__) && !defined(__aarch64__)
+  template<int dst, int src, int clr>
+  __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b)
+  {
+    return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
+  }
+
+  template<int dst, int src>
+  __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b)
+  {
+    return insert<dst, src, 0>(a, b);
+  }
+
+  template<int dst>
+  __forceinline vfloat4 insert(const vfloat4& a, const float b)
+  {
+    return insert<dst, 0>(a, _mm_set_ss(b));
+  }
+#else
+  template<int dst, int src>
+  __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b)
+  {
+    vfloat4 result = a;
+    result[dst&3] = b[src&3];
+    return result;
+  }
+
+  template<int dst>
+  __forceinline vfloat4 insert(const vfloat4& a, float b)
+  {
+    vfloat4 result = a;
+    result[dst&3] = b;
+    return result;
+  }
+#endif
+
+  /* Returns the lowest lane of v as a scalar. */
+  __forceinline float toScalar(const vfloat4& v)
+  {
+    return _mm_cvtss_f32(v);
+  }
+
+  /* Shifts the register right by 4 bytes, i.e. by one float lane: lane k of
+   * the result is x[k+1], and the top lane becomes zero bits. */
+  __forceinline vfloat4 shift_right_1(const vfloat4& x)
+  {
+    const __m128i bits    = _mm_castps_si128(x);
+    const __m128i shifted = _mm_srli_si128(bits, 4);
+    return _mm_castsi128_ps(shifted);
+  }
+
+#if defined (__AVX2__)
+  /* Variable permute: lanes of a are picked by the per-lane values in index
+   * (_mm_permutevar_ps). */
+  __forceinline vfloat4 permute(const vfloat4 &a, const __m128i &index)
+  {
+    return _mm_permutevar_ps(a,index);
+  }
+
+  /* Reads one float at a and replicates it into all four lanes. */
+  __forceinline vfloat4 broadcast1f(const void* a)
+  {
+    return _mm_broadcast_ss((float*)a);
+  }
+
+#endif
+
+#if defined(__AVX512VL__)
+  /* Wraps _mm_alignr_epi32 on the bit patterns of a and b with the
+   * compile-time element count i (only available under __AVX512VL__). */
+  template<int i>
+  __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b)
+  {
+    const __m128i a_bits = _mm_castps_si128(a);
+    const __m128i b_bits = _mm_castps_si128(b);
+    return _mm_castsi128_ps(_mm_alignr_epi32(a_bits, b_bits, i));
+  }
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting Network
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Three-stage min/max sorting network over the four elements of v.
+     Every stage pairs each element with a shuffled partner, takes the
+     element-wise min and max, and merges both with a constant select mask. */
+  __forceinline vfloat4 sort_ascending(const vfloat4& v)
+  {
+    /* stage 1: partners <1,0,3,2>, merge mask 0b0101 */
+    const vfloat4 partner0 = shuffle<1,0,3,2>(v);
+    const vfloat4 stage1   = select<0x5 /* 0b0101 */>(min(v,partner0), max(v,partner0));
+
+    /* stage 2: partners <2,3,0,1>, merge mask 0b0011 */
+    const vfloat4 partner1 = shuffle<2,3,0,1>(stage1);
+    const vfloat4 stage2   = select<0x3 /* 0b0011 */>(min(stage1,partner1), max(stage1,partner1));
+
+    /* stage 3: partners <0,2,1,3>, merge mask 0b0010 */
+    const vfloat4 partner2 = shuffle<0,2,1,3>(stage2);
+    return select<0x2 /* 0b0010 */>(min(stage2,partner2), max(stage2,partner2));
+  }
+
+  /* Same three-stage network as sort_ascending, with the roles of min and
+     max exchanged in every stage. */
+  __forceinline vfloat4 sort_descending(const vfloat4& v)
+  {
+    /* stage 1: partners <1,0,3,2>, merge mask 0b0101 */
+    const vfloat4 partner0 = shuffle<1,0,3,2>(v);
+    const vfloat4 stage1   = select<0x5 /* 0b0101 */>(max(v,partner0), min(v,partner0));
+
+    /* stage 2: partners <2,3,0,1>, merge mask 0b0011 */
+    const vfloat4 partner1 = shuffle<2,3,0,1>(stage1);
+    const vfloat4 stage2   = select<0x3 /* 0b0011 */>(max(stage1,partner1), min(stage1,partner1));
+
+    /* stage 3: partners <0,2,1,3>, merge mask 0b0010 */
+    const vfloat4 partner2 = shuffle<0,2,1,3>(stage2);
+    return select<0x2 /* 0b0010 */>(max(stage2,partner2), min(stage2,partner2));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Transpose
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* 4x4 transpose: rows r0..r3 in, columns c0..c3 out, built from two rounds
+     of unpacklo/unpackhi. All intermediates are computed before any output
+     is written. */
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2, vfloat4& c3)
+  {
+    /* round 1: interleave row 0 with row 2, and row 1 with row 3 */
+    const vfloat4 lo_02 = unpacklo(r0,r2);
+    const vfloat4 hi_02 = unpackhi(r0,r2);
+    const vfloat4 lo_13 = unpacklo(r1,r3);
+    const vfloat4 hi_13 = unpackhi(r1,r3);
+
+    /* round 2: interleave the partial results */
+    c0 = unpacklo(lo_02,lo_13);
+    c1 = unpackhi(lo_02,lo_13);
+    c2 = unpacklo(hi_02,hi_13);
+    c3 = unpackhi(hi_02,hi_13);
+  }
+
+  /* Variant of the 4x4 transpose that produces only the first three
+     columns c0..c2. */
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2)
+  {
+    /* round 1: interleave row 0 with row 2, and row 1 with row 3 */
+    const vfloat4 lo_02 = unpacklo(r0,r2);
+    const vfloat4 hi_02 = unpackhi(r0,r2);
+    const vfloat4 lo_13 = unpacklo(r1,r3);
+    const vfloat4 hi_13 = unpackhi(r1,r3);
+
+    /* round 2: interleave the partial results */
+    c0 = unpacklo(lo_02,lo_13);
+    c1 = unpackhi(lo_02,lo_13);
+    c2 = unpacklo(hi_02,hi_13);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+  /* Horizontal min/max/add that return the result as a vector.
+     aarch64: NEON across-vector reduction to a scalar, then vdupq_n_f32.
+     other targets: two shuffle-and-combine steps (pattern <1,0,3,2>, then
+     <2,3,0,1>), combining each intermediate with its shuffled copy. */
+#if defined(__aarch64__)
+  __forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); }
+  __forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); }
+  __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); }
+#else
+  __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
+  __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+  __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
+#endif
+
+  /* Scalar horizontal min/max/add.
+     aarch64: direct NEON across-vector reduction.
+     other targets: lowest element of the matching vreduce_* result. */
+#if defined(__aarch64__)
+  __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); }
+  __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); }
+  __forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); }
+#else
+  __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); }
+  __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); }
+  __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); }
+#endif
+
+  /* Index (bsf over movemask) of a valid lane holding the minimum of v.
+     Inactive lanes are replaced by +inf before reducing; if no valid lane
+     compares equal to the minimum, the first set bit of valid is used. */
+  __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v)
+  {
+    const vfloat4 masked = select(valid,v,vfloat4(pos_inf));
+    const vfloat4 lowest = vreduce_min(masked);
+    const vbool4 hits = valid & (masked == lowest);
+    const vbool4 chosen = any(hits) ? hits : valid;
+    return bsf(movemask(chosen));
+  }
+  /* Index (bsf over movemask) of a valid lane holding the maximum of v.
+     Inactive lanes are replaced by -inf before reducing; if no valid lane
+     compares equal to the maximum, the first set bit of valid is used. */
+  __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v)
+  {
+    const vfloat4 masked = select(valid,v,vfloat4(neg_inf));
+    const vfloat4 highest = vreduce_max(masked);
+    const vbool4 hits = valid & (masked == highest);
+    const vbool4 chosen = any(hits) ? hits : valid;
+    return bsf(movemask(chosen));
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Dot product over all four elements: element-wise product, then horizontal add */
+  __forceinline float dot(const vfloat4& a, const vfloat4& b)
+  {
+    const vfloat4 products = a*b;
+    return reduce_add(products);
+  }
+
+  /* Cross product on the first three elements.
+     Both operands are rotated with shuffle<1,2,0,3>, combined as
+     a*rot(b) - rot(a)*b, and the result is rotated once more. The fourth
+     lane evaluates a[3]*b[3] - a[3]*b[3]. */
+  __forceinline vfloat4 cross(const vfloat4& a, const vfloat4& b)
+  {
+    const vfloat4 b_rot = shuffle<1,2,0,3>(b);
+    const vfloat4 a_rot = shuffle<1,2,0,3>(a);
+    const vfloat4 diff  = msub(a,b_rot,a_rot*b);
+    return shuffle<1,2,0,3>(diff);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Writes the four elements in the form <e0, e1, e2, e3> */
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat4& a)
+  {
+    return cout << "<" << a[0]
+                << ", " << a[1]
+                << ", " << a[2]
+                << ", " << a[3]
+                << ">";
+  }
+
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vfloat8_avx.h
+++ b/thirdparty/embree/common/simd/vfloat8_avx.h
@@ -0,0 +1,821 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX float type.
+     Wraps a single __m256; the anonymous union exposes the same storage as
+     float[8] / int[8], which operator[] uses for per-element access. */
+  template<>
+  struct vfloat<8>
+  {
+    ALIGNED_STRUCT_(32);
+   
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };                        // number of SIMD elements
+    union { __m256 v; float f[8]; int i[8]; }; // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /* default construction does not initialize the register */
+    __forceinline vfloat() {}
+    __forceinline vfloat(const vfloat8& other) { v = other.v; }
+    __forceinline vfloat8& operator =(const vfloat8& other) { v = other.v; return *this; }
+
+    /* implicit conversion from and to the raw register type */
+    __forceinline vfloat(__m256 a) : v(a) {}
+    __forceinline operator const __m256&() const { return v; }
+    __forceinline operator       __m256&()       { return v; }
+
+    /* from 128-bit halves: the one-argument form places a in both halves,
+       the two-argument form places a in the low and b in the high half */
+    __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}
+    __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {}
+
+    /* unaligned load of 8 floats from a raw byte address */
+    __forceinline explicit vfloat(const char* a) : v(_mm256_loadu_ps((const float*)a)) {}
+    /* scalar forms: broadcast a, or repeat the (a,b) / (a,b,c,d) pattern;
+       the 8-argument form lists elements 0..7 in order (_mm256_set_ps takes
+       its arguments highest element first, hence the reversed order) */
+    __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {}
+    __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {}
+    __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {}
+    __forceinline vfloat(float a, float b, float c, float d, float e, float f, float g, float h) : v(_mm256_set_ps(h, g, f, e, d, c, b, a)) {}
+
+    /* numeric conversion from packed 32-bit integers */
+    __forceinline explicit vfloat(__m256i a) : v(_mm256_cvtepi32_ps(a)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat(ZeroTy)   : v(_mm256_setzero_ps()) {}
+    __forceinline vfloat(OneTy)    : v(_mm256_set1_ps(1.0f)) {}
+    __forceinline vfloat(PosInfTy) : v(_mm256_set1_ps(pos_inf)) {}
+    __forceinline vfloat(NegInfTy) : v(_mm256_set1_ps(neg_inf)) {}
+    __forceinline vfloat(StepTy)   : v(_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)) {}
+    __forceinline vfloat(NaNTy)    : v(_mm256_set1_ps(nan)) {}
+    __forceinline vfloat(UndefinedTy) : v(_mm256_undefined_ps()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /* replicates the float stored at a into all 8 elements */
+    static __forceinline vfloat8 broadcast(const void* a) {
+      return _mm256_broadcast_ss((float*)a);
+    }
+
+    /* Loads 8 signed bytes and converts them to float.
+       The AVX2 path uses a 64-bit load so that only the 8 source bytes are
+       read; a 128-bit load would touch 8 bytes beyond them. */
+    static __forceinline vfloat8 load(const char* ptr) {
+#if defined(__AVX2__)
+      return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*)ptr)));
+#else
+      return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
+#endif
+    }
+
+    /* Loads 8 unsigned bytes and converts them to float (64-bit load on the
+       AVX2 path, for the same reason as the signed overload). */
+    static __forceinline vfloat8 load(const unsigned char* ptr) {
+#if defined(__AVX2__)
+      return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)ptr)));
+#else
+      return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
+#endif
+    }
+
+    /* Loads 8 signed 16-bit values (16 bytes) and converts them to float */
+    static __forceinline vfloat8 load(const short* ptr) {
+#if defined(__AVX2__)
+      return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
+#else
+      return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
+#endif
+    }
+      
+    /* load/store use the aligned intrinsics, loadu/storeu the unaligned ones */
+    static __forceinline vfloat8 load (const void* ptr) { return _mm256_load_ps((float*)ptr); }
+    static __forceinline vfloat8 loadu(const void* ptr) { return _mm256_loadu_ps((float*)ptr); }
+
+    static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v); }
+
+    /* Masked loads and stores. With AVX512VL the mask is passed to the
+       _mm256_mask_* intrinsics, with zero as the pass-through value for
+       loads; otherwise both the aligned and unaligned names map onto
+       _mm256_maskload_ps / _mm256_maskstore_ps. */
+#if defined(__AVX512VL__)
+
+    static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask,(float*)ptr); }
+    static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask,(float*)ptr); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); }
+#else
+    static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,_mm256_castps_si256(mask.v)); }
+    static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,_mm256_castps_si256(mask.v)); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),v); }
+#endif
+    
+    /* streaming (non-temporal) load; only provided when AVX2 is available */
+#if defined(__AVX2__)
+    static __forceinline vfloat8 load_nt(void* ptr) {
+      return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i*)ptr));
+    }
+#endif
+    
+    /* streaming (non-temporal) store */
+    static __forceinline void store_nt(void* ptr, const vfloat8& v) {
+      _mm256_stream_ps((float*)ptr,v);
+    }
+
+    /* Gathers 8 floats from byte address ptr + scale*index[k].
+       Uses the hardware gather with AVX2 (non-aarch64), otherwise 8 scalar loads. */
+    template<int scale = 4>
+    static __forceinline vfloat8 gather(const float* ptr, const vint8& index) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+      return _mm256_i32gather_ps(ptr, index ,scale);
+#else
+      return vfloat8(
+          *(float*)(((char*)ptr)+scale*index[0]),
+          *(float*)(((char*)ptr)+scale*index[1]),
+          *(float*)(((char*)ptr)+scale*index[2]),
+          *(float*)(((char*)ptr)+scale*index[3]),
+          *(float*)(((char*)ptr)+scale*index[4]),
+          *(float*)(((char*)ptr)+scale*index[5]),
+          *(float*)(((char*)ptr)+scale*index[6]),
+          *(float*)(((char*)ptr)+scale*index[7]));
+#endif
+    }
+
+    /* Masked gather: the result starts as zero and only elements whose mask
+       entry is set are read from memory. */
+    template<int scale = 4>
+    static __forceinline vfloat8 gather(const vboolf8& mask, const float* ptr, const vint8& index) {
+      vfloat8 r = zero;
+#if defined(__AVX512VL__)
+      return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale);
+#elif defined(__AVX2__) && !defined(__aarch64__)
+      return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]);
+      if (likely(mask[4])) r[4] = *(float*)(((char*)ptr)+scale*index[4]);
+      if (likely(mask[5])) r[5] = *(float*)(((char*)ptr)+scale*index[5]);
+      if (likely(mask[6])) r[6] = *(float*)(((char*)ptr)+scale*index[6]);
+      if (likely(mask[7])) r[7] = *(float*)(((char*)ptr)+scale*index[7]);
+      return r;
+    #endif
+    }
+
+    /* Scatters the 8 elements of v to byte address ptr + scale*ofs[k].
+       Hardware scatter with AVX512VL, otherwise 8 scalar stores in element order. */
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vfloat8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_i32scatter_ps((float*)ptr, ofs, v, scale);
+#else
+      *(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      *(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      *(float*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      *(float*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      *(float*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      *(float*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      *(float*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      *(float*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    /* Masked scatter: only elements whose mask entry is set are written */
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vfloat8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale);
+#else
+      if (likely(mask[0])) *(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(float*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(float*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(float*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(float*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(float*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(float*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /* element access through the float view of the union; index must be < 8 */
+    __forceinline const float& operator [](size_t index) const { assert(index < 8); return f[index]; }
+    __forceinline       float& operator [](size_t index)       { assert(index < 8); return f[index]; }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* asFloat/asInt reinterpret the register contents via the _mm256_cast*
+     intrinsics; toInt/toFloat instead go through the converting constructors
+     of vint8 and vfloat8. */
+  __forceinline vfloat8 asFloat(const vint8&   a) { return _mm256_castsi256_ps(a); }
+  __forceinline vint8   asInt  (const vfloat8& a) { return _mm256_castps_si256(a); }
+
+  __forceinline vint8   toInt  (const vfloat8& a) { return vint8(a); }
+  __forceinline vfloat8 toFloat(const vint8&   a) { return vfloat8(a); }
+
+  /* Unary plus returns its operand unchanged.
+     Unary minus: non-aarch64 targets XOR every element with 0x80000000 (the
+     sign bit); aarch64 negates the lo and hi halves with vnegq_f32. */
+  __forceinline vfloat8 operator +(const vfloat8& a) { return a; }
+#if !defined(__aarch64__)
+  __forceinline vfloat8 operator -(const vfloat8& a) {
+    const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); 
+    return _mm256_xor_ps(a, mask);
+  }
+#else
+  __forceinline vfloat8 operator -(const vfloat8& a) {
+      __m256 res;
+      res.lo = vnegq_f32(a.v.lo);
+      res.hi = vnegq_f32(a.v.hi);
+      return res;
+}
+#endif
+
+  /* Absolute value: non-aarch64 targets AND every element with 0x7fffffff
+     (clearing the sign bit); aarch64 applies vabsq_f32 to the lo and hi halves. */
+#if !defined(__aarch64__)
+  __forceinline vfloat8 abs(const vfloat8& a) {
+    const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
+    return _mm256_and_ps(a, mask);
+  }
+#else
+__forceinline vfloat8 abs(const vfloat8& a) {
+    __m256 res;
+    res.lo = vabsq_f32(a.v.lo);
+    res.hi = vabsq_f32(a.v.hi);
+    return res;
+}
+#endif
+
+  /* sign: blends between +1 and -1 per element, with -1 chosen where the
+     comparison against zero holds. Non-aarch64 uses predicate _CMP_NGE_UQ
+     ("not greater-or-equal"), aarch64 uses a less-than compare.
+     NOTE(review): the two predicates may treat NaN inputs differently - confirm.
+     signmsk: keeps only the sign bit of each element (AND with 0x80000000). */
+#if !defined(__aarch64__)
+  __forceinline vfloat8 sign   (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); }
+#else
+  __forceinline vfloat8 sign   (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); }
+#endif
+  __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); }
+
+
+  /* Reciprocal 1/a.
+     aarch64: division of 1.0f by each half-register with vdivq_f32.
+     other targets: hardware estimate r (_mm256_rcp14_ps with AVX512VL, else
+     _mm256_rcp_ps) refined by one step r + r*(1 - a*r), fused when AVX2 is
+     defined. The two paths are mutually exclusive so that no unreachable
+     estimate code is compiled after the aarch64 return. */
+  static __forceinline vfloat8 rcp(const vfloat8& a)
+  {
+#if defined(__aarch64__)
+    vfloat8 ret;
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    ret.v.lo = vdivq_f32(one, a.v.lo);
+    ret.v.hi = vdivq_f32(one, a.v.hi);
+    return ret;
+#else
+
+#if defined(__AVX512VL__)
+    const vfloat8 r = _mm256_rcp14_ps(a);
+#else
+    const vfloat8 r = _mm256_rcp_ps(a);
+#endif
+
+#if defined(__AVX2__)
+    // First, compute 1 - a * r (which will be very close to 0)
+    const vfloat8 h_n = _mm256_fnmadd_ps(a, r, vfloat8(1.0f));
+    // Then compute r + r * h_n
+    return _mm256_fmadd_ps(r, h_n, r);
+#else
+    return _mm256_add_ps(r,_mm256_mul_ps(r, _mm256_sub_ps(vfloat8(1.0f), _mm256_mul_ps(a, r))));  // computes r + r * (1 - a * r)
+#endif
+
+#endif
+  }
+  /* sqr: element-wise a*a; sqrt: element-wise square root (_mm256_sqrt_ps) */
+  __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); }
+  __forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a); }
+
+  /* Reciprocal square root: hardware estimate r (_mm256_rsqrt14_ps with
+     AVX512VL, else _mm256_rsqrt_ps) followed by one refinement step
+     1.5*r + ((a * -0.5) * r) * (r * r). */
+  static __forceinline vfloat8 rsqrt(const vfloat8& a)
+  {
+#if defined(__AVX512VL__)
+    const vfloat8 r = _mm256_rsqrt14_ps(a);
+#else
+    const vfloat8 r = _mm256_rsqrt_ps(a);
+#endif
+
+    /* correction term, identical on both code paths below */
+    const __m256 neg_half_a_r = _mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r);
+    const __m256 correction   = _mm256_mul_ps(neg_half_a_r, _mm256_mul_ps(r, r));
+
+#if defined(__AVX2__)
+    return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r, correction);
+#else
+    return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r), correction);
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Element-wise +, -, * and / on the _mm256_*_ps intrinsics. The overloads
+     taking a float broadcast it to a vfloat8 and forward to the vector form. */
+  __forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a, b); }
+  __forceinline vfloat8 operator +(const vfloat8& a, float          b) { return a + vfloat8(b); }
+  __forceinline vfloat8 operator +(float          a, const vfloat8& b) { return vfloat8(a) + b; }
+
+  __forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a, b); }
+  __forceinline vfloat8 operator -(const vfloat8& a, float          b) { return a - vfloat8(b); }
+  __forceinline vfloat8 operator -(float          a, const vfloat8& b) { return vfloat8(a) - b; }
+
+  __forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a, b); }
+  __forceinline vfloat8 operator *(const vfloat8& a, float          b) { return a * vfloat8(b); }
+  __forceinline vfloat8 operator *(float          a, const vfloat8& b) { return vfloat8(a) * b; }
+
+  __forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a, b); }
+  __forceinline vfloat8 operator /(const vfloat8& a, float          b) { return a / vfloat8(b); }
+  __forceinline vfloat8 operator /(float          a, const vfloat8& b) { return vfloat8(a) / b; }
+
+  /* Bitwise AND / OR / XOR on the raw register bits; the last overload XORs
+     with an integer vector reinterpreted as float bits. */
+  __forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a,b); }
+  __forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return _mm256_or_ps(a,b); }
+  __forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a,b); }
+  __forceinline vfloat8 operator ^(const vfloat8& a, const vint8&   b) { return _mm256_xor_ps(a,_mm256_castsi256_ps(b)); }
+
+  /* Element-wise min/max via _mm256_min_ps / _mm256_max_ps; float operands
+     are broadcast to a vfloat8 first. */
+  __forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a, b); }
+  __forceinline vfloat8 min(const vfloat8& a, float          b) { return _mm256_min_ps(a, vfloat8(b)); }
+  __forceinline vfloat8 min(float          a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a), b); }
+
+  __forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a, b); }
+  __forceinline vfloat8 max(const vfloat8& a, float          b) { return _mm256_max_ps(a, vfloat8(b)); }
+  __forceinline vfloat8 max(float          a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a), b); }
+
+  /* need "static __forceinline" for MSVC, otherwise we'll link the wrong version in debug mode */
+#if defined(__AVX2__)
+
+  /* min of the element bit patterns compared as signed 32-bit integers */
+  static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) {
+    const vint8 lhs = asInt(a);
+    const vint8 rhs = asInt(b);
+    const vint8 res = _mm256_min_epi32(lhs,rhs);
+    return asFloat(res);
+  }
+
+  /* max of the element bit patterns compared as signed 32-bit integers */
+  static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) {
+    const vint8 lhs = asInt(a);
+    const vint8 rhs = asInt(b);
+    const vint8 res = _mm256_max_epi32(lhs,rhs);
+    return asFloat(res);
+  }
+
+  /* min of the element bit patterns compared as unsigned 32-bit integers */
+  static __forceinline vfloat8 minui(const vfloat8& a, const vfloat8& b) {
+    const vint8 lhs = asInt(a);
+    const vint8 rhs = asInt(b);
+    const vint8 res = _mm256_min_epu32(lhs,rhs);
+    return asFloat(res);
+  }
+
+  /* max of the element bit patterns compared as unsigned 32-bit integers */
+  static __forceinline vfloat8 maxui(const vfloat8& a, const vfloat8& b) {
+    const vint8 lhs = asInt(a);
+    const vint8 rhs = asInt(b);
+    const vint8 res = _mm256_max_epu32(lhs,rhs);
+    return asFloat(res);
+  }
+
+#else
+
+  /* without AVX2 the integer min/max of vint8 is used; only the signed forms exist here */
+  static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) {
+    const vint8 lhs = asInt(a);
+    const vint8 rhs = asInt(b);
+    return asFloat(min(lhs,rhs));
+  }
+
+  static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) {
+    const vint8 lhs = asInt(a);
+    const vint8 rhs = asInt(b);
+    return asFloat(max(lhs,rhs));
+  }
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* madd = a*b+c, msub = a*b-c, nmadd = -a*b+c, nmsub = -a*b-c.
+     With __AVX2__ defined these map onto the fused _mm256_f*_ps intrinsics;
+     otherwise they are built from separate multiply and add/subtract.
+     NOTE(review): the fused path is keyed on __AVX2__, not on __FMA__. */
+#if defined(__AVX2__)
+  static __forceinline vfloat8 madd  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a,b,c); }
+  static __forceinline vfloat8 msub  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a,b,c); }
+  static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a,b,c); }
+  static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a,b,c); }
+#else
+  static __forceinline vfloat8 madd  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b+c; }
+  static __forceinline vfloat8 msub  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b-c; }
+  static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b+c;}
+  static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b-c; }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Compound assignment: each operator evaluates the matching binary
+     operator, assigns the result to a and returns a by reference. */
+  __forceinline vfloat8& operator +=(vfloat8& a, const vfloat8& b) { return a = a + b; }
+  __forceinline vfloat8& operator +=(vfloat8& a, float          b) { return a = a + b; }
+
+  __forceinline vfloat8& operator -=(vfloat8& a, const vfloat8& b) { return a = a - b; }
+  __forceinline vfloat8& operator -=(vfloat8& a, float          b) { return a = a - b; }
+
+  __forceinline vfloat8& operator *=(vfloat8& a, const vfloat8& b) { return a = a * b; }
+  __forceinline vfloat8& operator *=(vfloat8& a, float          b) { return a = a * b; }
+
+  __forceinline vfloat8& operator /=(vfloat8& a, const vfloat8& b) { return a = a / b; }
+  __forceinline vfloat8& operator /=(vfloat8& a, float          b) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Element-wise comparisons returning a vboolf8, plus select(m,t,f), which
+     picks t where m is set and f elsewhere. Three implementations:
+       - AVX512VL: _mm256_cmp_ps_mask with _MM_CMPINT_* predicates, select via
+         _mm256_mask_blend_ps;
+       - other non-aarch64 targets: _mm256_cmp_ps with _CMP_* predicates,
+         select via _mm256_blendv_ps;
+       - aarch64: the _mm256_cmp{eq,neq,lt,ge,gt,le}_ps helpers, select via
+         _mm256_blendv_ps. */
+#if defined(__AVX512VL__)
+  static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_EQ); }
+  static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_NE); }
+  static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LT); }
+  static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GE); }
+  static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GT); }
+  static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LE); }
+
+  static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
+    return _mm256_mask_blend_ps(m, f, t);
+  }
+#elif !defined(__aarch64__)
+  static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);  }
+  static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
+  static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS);  }
+  static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
+  static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
+  static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS);  }
+
+  static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
+    return _mm256_blendv_ps(f, t, m); 
+  }
+#else
+  static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b);  }
+  static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); }
+  static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b);  }
+  static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b);  }
+  static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b);  }
+  static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b);  }
+
+  static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
+    return _mm256_blendv_ps(f, t, m);
+  }
+
+#endif
+
+  /* Select with a compile-time mask: wraps _mm256_blend_ps(f, t, mask) */
+  template<int mask>
+  __forceinline vfloat8 select(const vfloat8& t, const vfloat8& f)
+  {
+    return _mm256_blend_ps(f, t, mask);
+  }
+
+  /* Mixed vector/scalar comparisons: the float operand is broadcast to a
+     vfloat8 and the vector comparison is applied. */
+  __forceinline vboolf8 operator ==(const vfloat8& a, const float&   b) { return a == vfloat8(b); }
+  __forceinline vboolf8 operator ==(const float&   a, const vfloat8& b) { return vfloat8(a) == b; }
+
+  __forceinline vboolf8 operator !=(const vfloat8& a, const float&   b) { return a != vfloat8(b); }
+  __forceinline vboolf8 operator !=(const float&   a, const vfloat8& b) { return vfloat8(a) != b; }
+
+  __forceinline vboolf8 operator < (const vfloat8& a, const float&   b) { return a <  vfloat8(b); }
+  __forceinline vboolf8 operator < (const float&   a, const vfloat8& b) { return vfloat8(a) <  b; }
+
+  __forceinline vboolf8 operator >=(const vfloat8& a, const float&   b) { return a >= vfloat8(b); }
+  __forceinline vboolf8 operator >=(const float&   a, const vfloat8& b) { return vfloat8(a) >= b; }
+
+  __forceinline vboolf8 operator > (const vfloat8& a, const float&   b) { return a >  vfloat8(b); }
+  __forceinline vboolf8 operator > (const float&   a, const vfloat8& b) { return vfloat8(a) >  b; }
+
+  __forceinline vboolf8 operator <=(const vfloat8& a, const float&   b) { return a <= vfloat8(b); }
+  __forceinline vboolf8 operator <=(const float&   a, const vfloat8& b) { return vfloat8(a) <= b; }
+
+  /* Named forms of the comparison operators (eq is ==, ne is !=, lt is <,
+     ge is >=, gt is >, le is <=). */
+  __forceinline vboolf8 eq(const vfloat8& a, const vfloat8& b) { return a == b; }
+  __forceinline vboolf8 ne(const vfloat8& a, const vfloat8& b) { return a != b; }
+  __forceinline vboolf8 lt(const vfloat8& a, const vfloat8& b) { return a <  b; }
+  __forceinline vboolf8 ge(const vfloat8& a, const vfloat8& b) { return a >= b; }
+  __forceinline vboolf8 gt(const vfloat8& a, const vfloat8& b) { return a >  b; }
+  __forceinline vboolf8 le(const vfloat8& a, const vfloat8& b) { return a <= b; }
+
+  /* Masked comparisons. AVX512VL passes the mask to _mm256_mask_cmp_ps_mask;
+     all other targets AND the mask with the unmasked comparison result. */
+#if defined(__AVX512VL__)
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); }
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a == b); }
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a != b); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <  b); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >= b); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >  b); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <= b); }
+#endif
+
+  /* Linear interpolation a + t*(b-a), evaluated as madd(t, b-a, a) */
+  __forceinline vfloat8 lerp(const vfloat8& a, const vfloat8& b, const vfloat8& t)
+  {
+    const vfloat8 delta = b-a;
+    return madd(t,delta,a);
+  }
+
+  /* True when every element lies strictly between -FLT_LARGE and +FLT_LARGE */
+  __forceinline bool isvalid (const vfloat8& v)
+  {
+    const vboolf8 above_lower = v > vfloat8(-FLT_LARGE);
+    const vboolf8 below_upper = v < vfloat8(+FLT_LARGE);
+    return all(above_lower & below_upper);
+  }
+
+  /* True when every element lies within [-FLT_MAX, +FLT_MAX] */
+  __forceinline bool is_finite (const vfloat8& a)
+  {
+    const vboolf8 not_too_small = a >= vfloat8(-FLT_MAX);
+    const vboolf8 not_too_large = a <= vfloat8(+FLT_MAX);
+    return all(not_too_small & not_too_large);
+  }
+
+  /* Masked form: the [-FLT_MAX, +FLT_MAX] range test is passed to
+     all(valid, ...) together with the valid mask */
+  __forceinline bool is_finite (const vboolf8& valid, const vfloat8& a)
+  {
+    const vboolf8 not_too_small = a >= vfloat8(-FLT_MAX);
+    const vboolf8 not_too_large = a <= vfloat8(+FLT_MAX);
+    return all(valid, not_too_small & not_too_large);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Element-wise rounding. Non-aarch64 targets use _mm256_round_ps with the
+     matching _MM_FROUND_* mode; aarch64 provides only floor and ceil here
+     (no trunc/round in that branch). */
+#if !defined(__aarch64__)
+  __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
+  __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO       ); }
+  __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+#else
+  __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); }
+  __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); }
+#endif
+
+
+  __forceinline vfloat8 frac (const vfloat8& a) { const vfloat8 whole = floor(a); return a-whole; } // a minus floor(a), per lane
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a, b); } // thin wrapper over _mm256_unpacklo_ps
+  __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a, b); } // thin wrapper over _mm256_unpackhi_ps
+
+  template<int i> // broadcast form: the same index i fills all four selector slots
+  __forceinline vfloat8 shuffle(const vfloat8& v) {
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+  }
+
+  template<int i0, int i1> // permutes 128-bit halves of v; i0 lands in the low selector nibble, i1 in the high one
+  __forceinline vfloat8 shuffle4(const vfloat8& v) {
+    return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1> // two-source variant: halves are chosen from a and b with the same selector layout
+  __forceinline vfloat8 shuffle4(const vfloat8& a, const vfloat8& b) {
+    return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3> // general permute of v; indices are handed to _MM_SHUFFLE in reverse order
+  __forceinline vfloat8 shuffle(const vfloat8& v) {
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0, int i1, int i2, int i3> // two-source shuffle of a and b through _mm256_shuffle_ps
+  __forceinline vfloat8 shuffle(const vfloat8& a, const vfloat8& b) {
+    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+#if !defined(__aarch64__) // these specializations are compiled only for non-aarch64 targets
+  template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); } // dedicated intrinsic replaces the generic permute for this index pattern
+  template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); } // likewise for <1,1,3,3>
+  template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } // done in the double domain; the casts only reinterpret the register
+#endif
+
+  __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); } // loads the float at ptr into every lane
+  template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); } // copy of a with 128-bit part i replaced by b
+  template<size_t i> __forceinline vfloat4 extract4   (const vfloat8& a) { return _mm256_extractf128_ps(a, i); } // 128-bit part i of a
+  template<>         __forceinline vfloat4 extract4<0>(const vfloat8& a) { return _mm256_castps256_ps128(a);   } // part 0 is obtained with a cast instead of an extract
+
+  __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); } // lane 0 as a scalar float
+
+#if defined (__AVX2__) && !defined(__aarch64__) // only available when __AVX2__ is defined and the target is not aarch64
+  static __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) { // run-time permute: lanes of a are picked by the 32-bit entries of index
+    return _mm256_permutevar8x32_ps(a, index);
+  }
+#endif
+
+#if defined(__AVX512VL__) // only compiled when __AVX512VL__ is defined
+  template<int i> // i: compile-time element count forwarded to _mm256_alignr_epi32
+  static __forceinline vfloat8 align_shift_right(const vfloat8& a, const vfloat8& b) {
+    return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a), _mm256_castps_si256(b), i)); // integer-domain align; the casts only reinterpret bits
+  }  
+#endif
+
+#if defined (__AVX_I__) // half-float conversions, compiled only when __AVX_I__ is defined
+  template<const int mode> // mode: immediate passed through to _mm256_cvtps_ph
+  static __forceinline vint4 convert_to_hf16(const vfloat8& a) { // 8 floats -> 8 half floats, returned packed in a vint4
+    return _mm256_cvtps_ph(a, mode);
+  }
+
+  static __forceinline vfloat8 convert_from_hf16(const vint4& a) { // inverse direction: packed halves in a vint4 -> 8 floats
+    return _mm256_cvtph_ps(a);
+  }
+#endif
+
+#if defined(__AVX512VL__) // AVX-512VL path: single align instruction with a zero vector as the other operand
+  static __forceinline vfloat8 shift_right_1(const vfloat8& x) {
+    return align_shift_right<1>(zero,x);
+  }
+#else // fallback built from an in-lane rotate, a half swap and a blend
+  static __forceinline vfloat8 shift_right_1(const vfloat8& x) { // NOTE(review): lane 7 appears to receive x[0] here but zero in the branch above - confirm callers ignore lane 7
+    const vfloat8 t0 = shuffle<1,2,3,0>(x); // rotate by one lane inside each 128-bit half
+    const vfloat8 t1 = shuffle4<1,0>(t0); // same data with the two halves exchanged
+    return _mm256_blend_ps(t0,t1,0x88); // lanes 3 and 7 (mask bits 3 and 7) come from t1, the rest from t0
+  }
+#endif
+
+  __forceinline vint8 floori(const vfloat8& a) { // floor(a) handed to a vint8 constructor (presumably a float->int conversion; that constructor is not shown here)
+    return vint8(floor(a));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Transpose
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3)
+  {
+    // first interleave rows 0/2 and rows 1/3 (all inputs are read before any output is written)
+    const vfloat8 even_lo = unpacklo(r0,r2), even_hi = unpackhi(r0,r2);
+    const vfloat8 odd_lo  = unpacklo(r1,r3), odd_hi  = unpackhi(r1,r3);
+    // then interleave those intermediates to form the four outputs
+    c0 = unpacklo(even_lo,odd_lo);
+    c1 = unpackhi(even_lo,odd_lo);
+    c2 = unpacklo(even_hi,odd_hi);
+    c3 = unpackhi(even_hi,odd_hi);
+  }
+
+  __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2)
+  {
+    // three-output variant: same interleave steps as the four-output overload, without the last unpackhi
+    const vfloat8 even_lo = unpacklo(r0,r2), even_hi = unpackhi(r0,r2);
+    const vfloat8 odd_lo  = unpacklo(r1,r3), odd_hi  = unpackhi(r1,r3);
+    c0 = unpacklo(even_lo,odd_lo);
+    c1 = unpackhi(even_lo,odd_lo);
+    c2 = unpacklo(even_hi,odd_hi);
+  }
+
+
+  __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7,
+                               vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3, vfloat8& c4, vfloat8& c5, vfloat8& c6, vfloat8& c7)
+  {
+    vfloat8 top0,top1,top2,top3; transpose(r0,r1,r2,r3,top0,top1,top2,top3); // four-row transpose of rows 0..3
+    vfloat8 bot0,bot1,bot2,bot3; transpose(r4,r5,r6,r7,bot0,bot1,bot2,bot3); // four-row transpose of rows 4..7
+    c0 = shuffle4<0,2>(top0,bot0); // selector <0,2> for outputs 0..3
+    c1 = shuffle4<0,2>(top1,bot1);
+    c2 = shuffle4<0,2>(top2,bot2);
+    c3 = shuffle4<0,2>(top3,bot3);
+    c4 = shuffle4<1,3>(top0,bot0); // selector <1,3> for outputs 4..7
+    c5 = shuffle4<1,3>(top1,bot1);
+    c6 = shuffle4<1,3>(top2,bot2);
+    c7 = shuffle4<1,3>(top3,bot3);
+  }
+
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7,
+                               vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3)
+  {
+    const vfloat8 p04(r0,r4), p15(r1,r5), p26(r2,r6), p37(r3,r7); transpose(p04, p15, p26, p37, c0, c1, c2, c3); // pair row k with row k+4, then reuse the vfloat8 overload
+  }
+
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7,
+                               vfloat8& c0, vfloat8& c1, vfloat8& c2)
+  {
+    const vfloat8 p04(r0,r4), p15(r1,r5), p26(r2,r6), p37(r3,r7); transpose(p04, p15, p26, p37, c0, c1, c2); // three-output variant of the pairing above
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+#if !defined(__aarch64__)
+  __forceinline vfloat8 vreduce_min2(const vfloat8& v) { const vfloat8 swapped = shuffle<1,0,3,2>(v); return min(v,swapped); } // min with the adjacent lane
+  __forceinline vfloat8 vreduce_min4(const vfloat8& v) { const vfloat8 pairs = vreduce_min2(v); const vfloat8 crossed = shuffle<2,3,0,1>(pairs); return min(pairs,crossed); } // then across lane pairs
+  __forceinline vfloat8 vreduce_min (const vfloat8& v) { const vfloat8 quads = vreduce_min4(v); const vfloat8 flipped = shuffle4<1,0>(quads); return min(quads,flipped); } // then across the 128-bit halves
+
+  __forceinline vfloat8 vreduce_max2(const vfloat8& v) { const vfloat8 swapped = shuffle<1,0,3,2>(v); return max(v,swapped); } // max with the adjacent lane
+  __forceinline vfloat8 vreduce_max4(const vfloat8& v) { const vfloat8 pairs = vreduce_max2(v); const vfloat8 crossed = shuffle<2,3,0,1>(pairs); return max(pairs,crossed); } // then across lane pairs
+  __forceinline vfloat8 vreduce_max (const vfloat8& v) { const vfloat8 quads = vreduce_max4(v); const vfloat8 flipped = shuffle4<1,0>(quads); return max(quads,flipped); } // then across the 128-bit halves
+
+  __forceinline vfloat8 vreduce_add2(const vfloat8& v) { const vfloat8 swapped = shuffle<1,0,3,2>(v); return v + swapped; } // sum with the adjacent lane
+  __forceinline vfloat8 vreduce_add4(const vfloat8& v) { const vfloat8 pairs = vreduce_add2(v); const vfloat8 crossed = shuffle<2,3,0,1>(pairs); return pairs + crossed; } // then across lane pairs
+  __forceinline vfloat8 vreduce_add (const vfloat8& v) { const vfloat8 quads = vreduce_add4(v); const vfloat8 flipped = shuffle4<1,0>(quads); return quads + flipped; } // then across the 128-bit halves
+
+  __forceinline float reduce_min(const vfloat8& v) { const vfloat8 folded = vreduce_min(v); return toScalar(folded); } // scalar result: lane 0 of the vector reduction
+  __forceinline float reduce_max(const vfloat8& v) { const vfloat8 folded = vreduce_max(v); return toScalar(folded); }
+  __forceinline float reduce_add(const vfloat8& v) { const vfloat8 folded = vreduce_add(v); return toScalar(folded); }
+#else
+  __forceinline float reduce_min(const vfloat8& v) { return vminvq_f32(_mm_min_ps(v.v.lo,v.v.hi)); } // aarch64: combine the lo/hi 4-wide parts, then reduce across that vector
+  __forceinline float reduce_max(const vfloat8& v) { return vmaxvq_f32(_mm_max_ps(v.v.lo,v.v.hi)); } // same scheme with max
+  __forceinline vfloat8 vreduce_min(const vfloat8& v) { return vfloat8(reduce_min(v)); } // vector form is built from the scalar result
+  __forceinline vfloat8 vreduce_max(const vfloat8& v) { return vfloat8(reduce_max(v)); }
+  __forceinline float reduce_add(const vfloat8& v) { return vaddvq_f32(_mm_add_ps(v.v.lo,v.v.hi)); } // same scheme with add
+
+#endif
+  __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v)
+  {
+    const vfloat8 masked = select(valid,v,vfloat8(pos_inf)); // pos_inf is the alternative operand of the select
+    const vbool8 at_min = valid & (masked == vreduce_min(masked)); // valid lanes equal to the reduced minimum
+    const auto lane_bits = movemask(any(at_min) ? at_min : valid); return bsf(lane_bits); // uses `valid` itself when no lane matched
+  }
+
+  __forceinline size_t select_max(const vboolf8& valid, const vfloat8& v)
+  {
+    const vfloat8 masked = select(valid,v,vfloat8(neg_inf)); // neg_inf is the alternative operand of the select
+    const vbool8 at_max = valid & (masked == vreduce_max(masked)); // valid lanes equal to the reduced maximum
+    const auto lane_bits = movemask(any(at_max) ? at_max : valid); return bsf(lane_bits); // uses `valid` itself when no lane matched
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators (pairs of Vec3fa's)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) {
+  //  return vreduce_add4(a*b);
+  //}
+
+  __forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { // dot product via _mm256_dp_ps with immediate 0x7F
+    return _mm256_dp_ps(a,b,0x7F); // presumably: high nibble 0x7 multiplies elements 0..2, low nibble 0xF writes the sum to all four elements of each 128-bit half - confirm against the intrinsic docs
+  }
+
+  __forceinline vfloat8 cross(const vfloat8& a, const vfloat8& b)
+  {
+    // permute both inputs with <1,2,0,3>, combine with msub, then apply the same permute to the result
+    const vfloat8 b_perm = shuffle<1,2,0,3>(b);
+    const vfloat8 a_perm = shuffle<1,2,0,3>(a);
+    const vfloat8 combined = msub(a,b_perm,a_perm*b);
+    return shuffle<1,2,0,3>(combined);
+  }
+
+  //__forceinline float sqr_length (const vfloat<8>& a) { return dot(a,a); }
+  //__forceinline float rcp_length (const vfloat<8>& a) { return rsqrt(dot(a,a)); }
+  //__forceinline float rcp_length2(const vfloat<8>& a) { return rcp(dot(a,a)); }
+  //__forceinline float length     (const vfloat<8>& a) { return sqrt(dot(a,a)); }
+  __forceinline vfloat<8> normalize(const vfloat<8>& a) { const vfloat<8> inv_len = rsqrt(dot(a,a)); return a*inv_len; } // scales a by rsqrt(dot(a,a))
+  //__forceinline float distance(const vfloat<8>& a, const vfloat<8>& b) { return length(a-b); }
+  //__forceinline float halfArea(const vfloat<8>& d) { return madd(d.x,(d.y+d.z),d.y*d.z); }
+  //__forceinline float area    (const vfloat<8>& d) { return 2.0f*halfArea(d); }
+  //__forceinline vfloat<8> reflect(const vfloat<8>& V, const vfloat<8>& N) { return 2.0f*dot(V,N)*N-V; }
+
+  //__forceinline vfloat<8> normalize_safe(const vfloat<8>& a) {
+  //  const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
+  //}
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// In Register Sorting
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 sort_ascending(const vfloat8& v) // sorts the 8 lanes in registers: six stages of min/max against a permuted copy, mixed by select<mask>
+  {
+    const vfloat8 a0 = v;
+    const vfloat8 b0 = shuffle<1,0,3,2>(a0); // stage 1 partner: adjacent lanes exchanged
+    const vfloat8 c0 = min(a0,b0);
+    const vfloat8 d0 = max(a0,b0);
+    const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); // per-lane choice between the min and max vectors, driven by the mask bits
+    const vfloat8 b1 = shuffle<2,3,0,1>(a1); // stage 2 partner: lane pairs exchanged inside each 128-bit half
+    const vfloat8 c1 = min(a1,b1);
+    const vfloat8 d1 = max(a1,b1);
+    const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+    const vfloat8 b2 = shuffle<1,0,3,2>(a2); // stage 3 partner: adjacent lanes again
+    const vfloat8 c2 = min(a2,b2);
+    const vfloat8 d2 = max(a2,b2);
+    const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+    const vfloat8 b3 = shuffle4<1,0>(a3); // stage 4 partner: the two 128-bit halves exchanged
+    const vfloat8 c3 = min(a3,b3);
+    const vfloat8 d3 = max(a3,b3);
+    const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3);
+    const vfloat8 b4 = shuffle<2,3,0,1>(a4); // stage 5 partner: lane pairs inside each half
+    const vfloat8 c4 = min(a4,b4);
+    const vfloat8 d4 = max(a4,b4);
+    const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+    const vfloat8 b5 = shuffle<1,0,3,2>(a5); // stage 6 partner: adjacent lanes
+    const vfloat8 c5 = min(a5,b5);
+    const vfloat8 d5 = max(a5,b5);
+    const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5);
+    return a6;
+  }
+
+   __forceinline vfloat8 sort_descending(const vfloat8& v) // same six-stage network as sort_ascending with min and max exchanged in every stage
+  {
+    const vfloat8 a0 = v;
+    const vfloat8 b0 = shuffle<1,0,3,2>(a0); // stage 1 partner: adjacent lanes exchanged
+    const vfloat8 c0 = max(a0,b0);
+    const vfloat8 d0 = min(a0,b0);
+    const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); // per-lane choice between the max and min vectors, driven by the mask bits
+    const vfloat8 b1 = shuffle<2,3,0,1>(a1); // stage 2 partner: lane pairs exchanged inside each 128-bit half
+    const vfloat8 c1 = max(a1,b1);
+    const vfloat8 d1 = min(a1,b1);
+    const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+    const vfloat8 b2 = shuffle<1,0,3,2>(a2); // stage 3 partner: adjacent lanes again
+    const vfloat8 c2 = max(a2,b2);
+    const vfloat8 d2 = min(a2,b2);
+    const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+    const vfloat8 b3 = shuffle4<1,0>(a3); // stage 4 partner: the two 128-bit halves exchanged
+    const vfloat8 c3 = max(a3,b3);
+    const vfloat8 d3 = min(a3,b3);
+    const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3);
+    const vfloat8 b4 = shuffle<2,3,0,1>(a4); // stage 5 partner: lane pairs inside each half
+    const vfloat8 c4 = max(a4,b4);
+    const vfloat8 d4 = min(a4,b4);
+    const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+    const vfloat8 b5 = shuffle<1,0,3,2>(a5); // stage 6 partner: adjacent lanes
+    const vfloat8 c5 = max(a5,b5);
+    const vfloat8 d5 = min(a5,b5);
+    const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5);
+    return a6;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat8& a) { // prints the 8 lanes as "<a0, a1, ..., a7>"
+    cout << "<" << a[0]; for (size_t lane = 1; lane < 8; lane++) cout << ", " << a[lane]; return cout << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vint16_avx512.h
+++ b/thirdparty/embree/common/simd/vint16_avx512.h
@@ -0,0 +1,472 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{ 
+  /* 16-wide AVX-512 integer type */
+  template<> // specialization of vint for 16 lanes, stored in a single __m512i
+  struct vint<16>
+  {
+    ALIGNED_STRUCT_(64); // macro defined elsewhere; presumably enforces 64-byte alignment - confirm
+    
+    typedef vboolf16 Bool;
+    typedef vint16   Int;
+    typedef vfloat16 Float;
+
+    enum  { size = 16 }; // number of SIMD elements
+    union {              // data: the same storage viewed as one register or as 16 ints
+      __m512i v; 
+      int i[16]; 
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vint() {} // no initialization is performed
+    __forceinline vint(const vint16& t) { v = t.v; }
+    __forceinline vint16& operator =(const vint16& f) { v = f.v; return *this; }
+
+    __forceinline vint(const __m512i& t) { v = t; } // implicit conversion from the raw register type
+    __forceinline operator __m512i() const { return v; }
+    __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } // 256-bit view obtained with a cast intrinsic
+
+    __forceinline vint(int i) { // broadcasts i to all 16 lanes
+      v = _mm512_set1_epi32(i);
+    }
+    
+    __forceinline vint(int a, int b, int c, int d) { // 4-element pattern via _mm512_set4_epi32; arguments are passed in reverse order
+      v = _mm512_set4_epi32(d,c,b,a);      
+    }
+
+    __forceinline vint(int a0 , int a1 , int a2 , int a3,
+                       int a4 , int a5 , int a6 , int a7,
+                       int a8 , int a9 , int a10, int a11,
+                       int a12, int a13, int a14, int a15)
+    {
+      v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); // _mm512_set_epi32 receives the values in reverse order (a15 first)
+    }
+
+    __forceinline vint(const vint4& i) { // replicates one 4-wide vector through _mm512_broadcast_i32x4
+      v = _mm512_broadcast_i32x4(i);
+    }
+
+    __forceinline vint(const vint4& a, const vint4& b, const vint4& c, const vint4& d) { // a seeds the register via a cast; b, c, d are inserted at 128-bit positions 1..3
+      v = _mm512_castsi128_si512(a);
+      v = _mm512_inserti32x4(v, b, 1);
+      v = _mm512_inserti32x4(v, c, 2);
+      v = _mm512_inserti32x4(v, d, 3);
+    }
+
+    __forceinline vint(const vint8& i) { // replicates one 8-wide vector; done with the f64x4 broadcast, casts only reinterpret bits
+      v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i))));
+    }
+
+    __forceinline vint(const vint8& a, const vint8& b) { // a seeds the register via a cast; b is inserted at 256-bit position 1
+      v = _mm512_castsi256_si512(a);
+      v = _mm512_inserti64x4(v, b, 1);
+    }
+   
+    __forceinline explicit vint(const __m512& f) { // value conversion float -> int32 (_mm512_cvtps_epi32), not a bit cast
+      v = _mm512_cvtps_epi32(f);
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vint(ZeroTy)   : v(_mm512_setzero_epi32()) {}
+    __forceinline vint(OneTy)    : v(_mm512_set1_epi32(1)) {}
+    __forceinline vint(PosInfTy) : v(_mm512_set1_epi32(pos_inf)) {}
+    __forceinline vint(NegInfTy) : v(_mm512_set1_epi32(neg_inf)) {}
+    __forceinline vint(StepTy)   : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} // same literal list as ReverseStepTy below, but through set_ instead of setr_
+    __forceinline vint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} // setr_ takes its arguments in the opposite order from set_
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint16 load (const void* addr) { return _mm512_load_si512((int*)addr); } // aligned-load intrinsic
+
+    static __forceinline vint16 load(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); } // loads 128 bits of bytes and widens them with the unsigned 8->32 convert
+    static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); } // loads 256 bits of shorts and widens them with the unsigned 16->32 convert
+
+    static __forceinline vint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } // as above, using the unaligned-load intrinsic
+    static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); }
+
+    static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); } // unaligned-load intrinsic
+
+    static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask,addr); } // masked load with a zero vector as the source operand
+    static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask,addr); }
+
+    static __forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v); }
+    static __forceinline void storeu(void* ptr, const vint16& v) { _mm512_storeu_si512(ptr,v); }
+ 
+    static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask,v2); } // masked store
+    static __forceinline void storeu(const vboolf16& mask, void* ptr,  const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask,f); }
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a); } // store through the streaming intrinsic
+
+    static __forceinline vint16 compact(const vboolf16& mask, vint16 &v) { // compress intrinsic with v as both source and pass-through operand
+      return _mm512_mask_compress_epi32(v,mask,v);
+    }
+
+    static __forceinline vint16 compact(const vboolf16& mask, const vint16 &a, vint16 &b) { // compresses b under mask; a is the pass-through operand
+      return _mm512_mask_compress_epi32(a,mask,b);
+    }
+
+    static __forceinline vint16 expand(const vboolf16& mask, const vint16& a, vint16& b) { // expands a under mask; b is the pass-through operand
+      return _mm512_mask_expand_epi32(b,mask,a);
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to each index (default 4)
+    static __forceinline vint16 gather(const int* ptr, const vint16& index) {
+      return _mm512_i32gather_epi32(index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vint16 gather(const vboolf16& mask, const int* ptr, const vint16& index) { // masked gather whose pass-through operand is _mm512_undefined_epi32()
+      return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vint16 gather(const vboolf16& mask, vint16& dest, const int* ptr, const vint16& index) { // masked gather with dest as the pass-through operand; dest itself is not assigned here
+      return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(int* ptr, const vint16& index, const vint16& v) {
+      _mm512_i32scatter_epi32((int*)ptr,index,v,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf16& mask, int* ptr, const vint16& index, const vint16& v) { // masked scatter
+      _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       int& operator [](size_t index)       { assert(index < 16); return i[index]; } // lane access through the int view of the union
+    __forceinline const int& operator [](size_t index) const { assert(index < 16); return i[index]; }
+
+    __forceinline unsigned int uint    (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } // lane reinterpreted as unsigned
+    __forceinline size_t&      uint64_t(size_t index) const { assert(index < 8);  return ((size_t*)i)[index]; } // storage viewed as size_t elements; NOTE(review): returns a mutable reference from a const method, and the name matches the uint64_t type name
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 asBool(const vint16& a) { return _mm512_movepi32_mask(a); } // builds a lane mask from a through _mm512_movepi32_mask
+
+  __forceinline vint16 operator +(const vint16& a) { return a; }
+  __forceinline vint16 operator -(const vint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } // negation written as 0 - a
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a, b); } // per-lane 32-bit add
+  __forceinline vint16 operator +(const vint16& a, int           b) { return a + vint16(b); } // scalar overloads first broadcast the int through vint16(int)
+  __forceinline vint16 operator +(int           a, const vint16& b) { return vint16(a) + b; }
+
+  __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a, b); } // per-lane 32-bit subtract
+  __forceinline vint16 operator -(const vint16& a, int           b) { return a - vint16(b); }
+  __forceinline vint16 operator -(int           a, const vint16& b) { return vint16(a) - b; }
+
+  __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a, b); } // per-lane multiply through the mullo intrinsic
+  __forceinline vint16 operator *(const vint16& a, int           b) { return a * vint16(b); }
+  __forceinline vint16 operator *(int           a, const vint16& b) { return vint16(a) * b; }
+
+  __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a, b); } // bitwise AND
+  __forceinline vint16 operator &(const vint16& a, int           b) { return a & vint16(b); }
+  __forceinline vint16 operator &(int           a, const vint16& b) { return vint16(a) & b; }
+
+  __forceinline vint16 operator |(const vint16& a, const vint16& b) { return _mm512_or_epi32(a, b); } // bitwise OR
+  __forceinline vint16 operator |(const vint16& a, int           b) { return a | vint16(b); }
+  __forceinline vint16 operator |(int           a, const vint16& b) { return vint16(a) | b; }
+
+  __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a, b); } // bitwise XOR
+  __forceinline vint16 operator ^(const vint16& a, int           b) { return a ^ vint16(b); }
+  __forceinline vint16 operator ^(int           a, const vint16& b) { return vint16(a) ^ b; }
+
+  __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a, n); } // left shift of every lane by the same count
+  __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a, n); } // right shift uses the arithmetic (srai) intrinsic
+
+  __forceinline vint16 operator <<(const vint16& a, const vint16& n) { return _mm512_sllv_epi32(a, n); } // per-lane shift counts taken from n
+  __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return _mm512_srav_epi32(a, n); } // arithmetic variant with per-lane counts
+
+  __forceinline vint16 sll (const vint16& a, int b) { return _mm512_slli_epi32(a, b); } // named form of operator<<
+  __forceinline vint16 sra (const vint16& a, int b) { return _mm512_srai_epi32(a, b); } // named form of operator>> (arithmetic)
+  __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a, b); } // logical right shift (srli); has no operator form in this group
+  
+  __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a, b); } // per-lane minimum using the signed (epi32) intrinsic
+  __forceinline vint16 min(const vint16& a, int           b) { return min(a,vint16(b)); } // scalar overloads broadcast the int first
+  __forceinline vint16 min(int           a, const vint16& b) { return min(vint16(a),b); }
+
+  __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a, b); } // per-lane maximum using the signed (epi32) intrinsic
+  __forceinline vint16 max(const vint16& a, int           b) { return max(a,vint16(b)); }
+  __forceinline vint16 max(int           a, const vint16& b) { return max(vint16(a),b); }
+  
+  __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a, b); } // unsigned (epu32) counterpart of min
+  __forceinline vint16 umax(const vint16& a, const vint16& b) { return _mm512_max_epu32(a, b); } // unsigned (epu32) counterpart of max
+
+  __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } // masked a+b with c as the pass-through operand; c itself is not assigned
+  __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } // masked a-b, same operand layout
+
+  __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } // masked a&b, c is the pass-through operand
+  __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } // masked a|b, c is the pass-through operand
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16& operator +=(vint16& a, const vint16& b) { a = a + b; return a; } // compound forms delegate to the matching binary operator
+  __forceinline vint16& operator +=(vint16& a, int           b) { a = a + b; return a; }
+  
+  __forceinline vint16& operator -=(vint16& a, const vint16& b) { a = a - b; return a; }
+  __forceinline vint16& operator -=(vint16& a, int           b) { a = a - b; return a; }
+
+  __forceinline vint16& operator *=(vint16& a, const vint16& b) { a = a * b; return a; }
+  __forceinline vint16& operator *=(vint16& a, int           b) { a = a * b; return a; }
+  
+  __forceinline vint16& operator &=(vint16& a, const vint16& b) { a = a & b; return a; }
+  __forceinline vint16& operator &=(vint16& a, int           b) { a = a & b; return a; }
+  
+  __forceinline vint16& operator |=(vint16& a, const vint16& b) { a = a | b; return a; }
+  __forceinline vint16& operator |=(vint16& a, int           b) { a = a | b; return a; }
+  
+  __forceinline vint16& operator <<=(vint16& a, int b) { a = a << b; return a; }
+  __forceinline vint16& operator >>=(vint16& a, int b) { a = a >> b; return a; }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 operator ==(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } // comparisons in this group use the signed (epi32) compare and return a lane mask
+  __forceinline vboolf16 operator ==(const vint16& a, int           b) { return a == vint16(b); } // scalar overloads broadcast the int first
+  __forceinline vboolf16 operator ==(int           a, const vint16& b) { return vint16(a) == b; }
+  
+  __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 operator !=(const vint16& a, int           b) { return a != vint16(b); }
+  __forceinline vboolf16 operator !=(int           a, const vint16& b) { return vint16(a) != b; }
+  
+  __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 operator < (const vint16& a, int           b) { return a <  vint16(b); }
+  __forceinline vboolf16 operator < (int           a, const vint16& b) { return vint16(a) <  b; }
+  
+  __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 operator >=(const vint16& a, int           b) { return a >= vint16(b); }
+  __forceinline vboolf16 operator >=(int           a, const vint16& b) { return vint16(a) >= b; }
+
+  __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 operator > (const vint16& a, int           b) { return a >  vint16(b); }
+  __forceinline vboolf16 operator > (int           a, const vint16& b) { return vint16(a) >  b; }
+
+  __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 operator <=(const vint16& a, int           b) { return a <= vint16(b); }
+  __forceinline vboolf16 operator <=(int           a, const vint16& b) { return vint16(a) <= b; }
+
+  __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } // named forms issue the same intrinsic calls as the operators
+  __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } // unsigned (epu32) compare
+  __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } // unsigned (epu32) compare
+
+  __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_EQ); } // masked forms: mask is passed to the intrinsic as its write mask
+  __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } // masked unsigned compare
+  __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } // masked unsigned compare
+    
+ 
+  __forceinline vint16 select(const vboolf16& m, const vint16& t, const vint16& f) { // per-lane choice between t and f under mask m
+    return _mm512_mask_or_epi32(f,m,t,t); // written as a masked OR of t with itself (t|t == t), with f as the pass-through operand
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a, b); } // thin wrapper over _mm512_unpacklo_epi32
+  __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a, b); } // thin wrapper over _mm512_unpackhi_epi32
+
+  template<int i> // broadcast form: the same index i fills all four selector slots
+    __forceinline vint16 shuffle(const vint16& v) {
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); // float-domain permute; the casts only reinterpret bits
+  }
+
+  template<int i0, int i1, int i2, int i3> // general form; indices are handed to _MM_SHUFFLE in reverse order
+  __forceinline vint16 shuffle(const vint16& v) {
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i> // 128-bit-block shuffle with the same block index i in every selector slot
+  __forceinline vint16 shuffle4(const vint16& v) {
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1, int i2, int i3> // general 128-bit-block shuffle; v is supplied as both sources
+  __forceinline vint16 shuffle4(const vint16& v) {
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+  // Returns the low 16 elements of the concatenation a:b (a above b) shifted right by i 32-bit elements.
+  template<int i>
+  __forceinline vint16 align_shift_right(const vint16& a, const vint16& b) {
+    return _mm512_alignr_epi32(a, b, i);
+  }
+  // Returns element 0 (the lowest 32 bits of v) as a scalar int.
+  __forceinline int toScalar(const vint16& v) {
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(v));
+  }
+  // Returns a copy of a whose i-th 128-bit block is replaced by b.
+  template<int i> __forceinline vint16 insert4(const vint16& a, const vint4& b) { return _mm512_inserti32x4(a, b, i); }
+  // extractN<N,i>: the i-th group of N consecutive elements of v. The primary template is only declared here.
+  template<int N, int i>
+  vint<N> extractN(const vint16& v);
+
+  template<> __forceinline vint4 extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v);       }
+  template<> __forceinline vint4 extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 1); }
+  template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 2); }
+  template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 3); }
+
+  template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return _mm512_castsi512_si256(v);       }
+  template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v, 1); }
+  // extract4<i> / extract8<i>: the i-th 128-bit / 256-bit part of v; index 0 is specialized as a plain cast.
+  template<int i> __forceinline vint4 extract4   (const vint16& v) { return _mm512_extracti32x4_epi32(v, i); }
+  template<>      __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v);       }
+
+  template<int i> __forceinline vint8 extract8   (const vint16& v) { return _mm512_extracti32x8_epi32(v, i); }
+  template<>      __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v);       }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+  // Butterfly reductions: after vreduce_<op>K every element holds <op> over its aligned K-element group;
+  __forceinline vint16 vreduce_min2(vint16 x) {                      return min(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vint16 vreduce_min4(vint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vint16 vreduce_min8(vint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); }
+  __forceinline vint16 vreduce_min (vint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); }
+  // the variant without a numeric suffix reduces across all 16 elements and leaves the result in every element.
+  __forceinline vint16 vreduce_max2(vint16 x) {                      return max(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vint16 vreduce_max4(vint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vint16 vreduce_max8(vint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); }
+  __forceinline vint16 vreduce_max (vint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); }
+
+  __forceinline vint16 vreduce_and2(vint16 x) {                      return x & shuffle<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_and4(vint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); }
+  __forceinline vint16 vreduce_and8(vint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_and (vint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); }
+
+  __forceinline vint16 vreduce_or2(vint16 x) {                     return x | shuffle<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_or4(vint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); }
+  __forceinline vint16 vreduce_or8(vint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_or (vint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); }
+
+  __forceinline vint16 vreduce_add2(vint16 x) {                      return x + shuffle<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_add4(vint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+  __forceinline vint16 vreduce_add8(vint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_add (vint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); }
+  // Scalar reductions: run the full 16-element vector reduction and read element 0 of the result.
+  __forceinline int reduce_min(const vint16& v) { return toScalar(vreduce_min(v)); }
+  __forceinline int reduce_max(const vint16& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_and(const vint16& v) { return toScalar(vreduce_and(v)); }
+  __forceinline int reduce_or (const vint16& v) { return toScalar(vreduce_or (v)); }
+  __forceinline int reduce_add(const vint16& v) { return toScalar(vreduce_add(v)); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Memory load and store operations
+  ////////////////////////////////////////////////////////////////////////////////
+  // Wrappers over the AVX-512 conflict-detection intrinsic for 32-bit indices (unmasked form).
+  __forceinline vint16 conflict(const vint16& index)
+  {
+    return _mm512_conflict_epi32(index);
+  }
+  // Masked form: dest is handed to the intrinsic as the source operand used for lanes not enabled in mask.
+  __forceinline vint16 conflict(const vboolf16& mask, vint16& dest, const vint16& index)
+  {
+    return _mm512_mask_conflict_epi32(dest,mask,index);
+  }    
+  // Converts 16 packed floats to unsigned 32-bit integers (_mm512_cvtps_epu32) and returns them in a vint16.
+  __forceinline vint16 convert_uint32_t(const __m512& f) {
+    return _mm512_cvtps_epu32(f);
+  }
+  // Full cross-lane permute: result element k is the element of v selected by index[k].
+  __forceinline vint16 permute(vint16 v, vint16 index) {
+    return _mm512_permutexvar_epi32(index,v);  
+  }
+  // Permutes a with the reverse_step index vector (presumably 15..0, i.e. reverses the element order).
+  __forceinline vint16 reverse(const vint16 &a) {
+    return permute(a,vint16(reverse_step));
+  }
+  // Inclusive prefix sum: element k ends up holding a[0] + ... + a[k].
+  __forceinline vint16 prefix_sum(const vint16& a) 
+  {
+    const vint16 z(zero);
+    vint16 v = a;
+    v = v + align_shift_right<16-1>(v,z); // add v moved up by 1 lane (zero fill)
+    v = v + align_shift_right<16-2>(v,z); // ... by 2 lanes
+    v = v + align_shift_right<16-4>(v,z); // ... by 4 lanes
+    v = v + align_shift_right<16-8>(v,z); // ... by 8 lanes
+    return v;  
+  }
+  // Inclusive suffix sum: element k ends up holding a[k] + a[k+1] + ... + a[15].
+  __forceinline vint16 reverse_prefix_sum(const vint16& a) 
+  {
+    const vint16 z(zero);
+    vint16 v = a;
+    v = v + align_shift_right<1>(z,v); // add v moved down by 1 lane (zero fill)
+    v = v + align_shift_right<2>(z,v); // ... by 2 lanes
+    v = v + align_shift_right<4>(z,v); // ... by 4 lanes
+    v = v + align_shift_right<8>(z,v); // ... by 8 lanes
+    return v;  
+  }
+  // Masked gather prefetch with 64-bit offsets; the body is empty unless __AVX512PF__ is defined.
+  /* this should use a vbool8 and a vint8_64...*/
+  template<int scale = 1, int hint = _MM_HINT_T0>
+    __forceinline void gather_prefetch64(const void* base_addr, const vbool16& mask, const vint16& offset)
+  {
+#if defined(__AVX512PF__)
+    _mm512_mask_prefetch_i64gather_pd(offset, mask, base_addr, scale, hint);
+#endif
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  // Streams the 16 elements of v as "<v0, v1, ..., v15>".
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint16& v)
+  {
+    cout << "<";
+    for (int lane=0; lane<16; lane++) { if (lane != 0) cout << ", "; cout << v[lane]; }
+    cout << ">";
+    return cout;
+  }
+}
+// Drop the generic SIMD type-name macros (presumably defined at the top of this header) so they do not leak out.
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vint4_sse2.h
+++ b/thirdparty/embree/common/simd/vint4_sse2.h
@@ -0,0 +1,652 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/emath.h"
+// From here on in this header the generic SIMD type names expand to their *_impl counterparts.
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide SSE integer type */
+  template<>
+  struct vint<4>
+  {
+    ALIGNED_STRUCT_(16);
+    // Companion mask, integer and float vector types of the same width.
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 };             // number of SIMD elements
+    union { __m128i v; int i[4]; }; // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vint() {} // leaves the lanes uninitialized
+    __forceinline vint(const vint4& a) { v = a.v; }
+    __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; }
+
+    __forceinline vint(__m128i a) : v(a) {}
+    __forceinline operator const __m128i&() const { return v; }
+    __forceinline operator       __m128i&()       { return v; }
+
+    __forceinline vint(int a) : v(_mm_set1_epi32(a)) {} // broadcast a to all four lanes
+    __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {} // a becomes element 0
+
+    __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {} // float -> int value conversion, not a bit cast
+#if defined(__AVX512VL__)
+    __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {}
+#else
+    __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {}
+#endif
+
+    __forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {} // a fills the low 64 bits, b the high 64 bits
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint(ZeroTy)        : v(_mm_setzero_si128()) {}
+    __forceinline vint(OneTy)         : v(_mm_set_epi32(1, 1, 1, 1)) {}
+    __forceinline vint(PosInfTy)      : v(_mm_set_epi32(pos_inf, pos_inf, pos_inf, pos_inf)) {}
+    __forceinline vint(NegInfTy)      : v(_mm_set_epi32(neg_inf, neg_inf, neg_inf, neg_inf)) {}
+    __forceinline vint(StepTy)        : v(_mm_set_epi32(3, 2, 1, 0)) {}
+    __forceinline vint(ReverseStepTy) : v(_mm_set_epi32(0, 1, 2, 3)) {}
+
+    __forceinline vint(TrueTy)        : v(_mm_set1_epi32(-1)) {} // all bits set, without reading the uninitialized member
+    __forceinline vint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {}
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+    // load/store use the aligned 128-bit intrinsics; loadu/storeu use the unaligned ones.
+    static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); }
+    static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); }
+
+    static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); }
+    
+#if defined(__AVX512VL__)
+
+    static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) {
+      return _mm_mask_compress_epi32(v, mask, v);
+    }
+    static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) {
+      return _mm_mask_compress_epi32(a, mask, b);
+    }
+
+    static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); }
+    static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); }
+#elif defined(__AVX__)
+    static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+    static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+#else
+    static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } // reads all 16 bytes, then zeroes masked-off lanes
+    static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } // reads all 16 bytes, then zeroes masked-off lanes
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); } // read-modify-write of all 16 bytes
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } // read-modify-write of all 16 bytes
+#endif
+
+    // Widening loads: four unsigned 8-bit values are zero-extended to four 32-bit lanes.
+#if defined(__aarch64__)
+    static __forceinline vint4 load(const unsigned char* ptr) {
+        return _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+    static __forceinline vint4 loadu(const unsigned char* ptr) {
+        return  _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vint4 load(const unsigned char* ptr) {
+      return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(const int*)ptr)); // 32-bit load: touch only the 4 source bytes
+    }
+
+    static __forceinline vint4 loadu(const unsigned char* ptr) {
+      return  _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(const int*)ptr)); // 32-bit load: touch only the 4 source bytes
+    }
+#else
+
+    static __forceinline vint4 load(const unsigned char* ptr) {
+      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    } 
+
+    static __forceinline vint4 loadu(const unsigned char* ptr) {
+      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+
+#endif
+    // Widening load of four unsigned 16-bit values into four 32-bit lanes.
+    static __forceinline vint4 load(const unsigned short* ptr) {
+#if defined(__aarch64__)
+      return __m128i(vmovl_u16(vld1_u16(ptr)));
+#elif defined (__SSE4_1__)
+      return _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)ptr)); // 64-bit load: exactly the 4 source elements
+#else
+      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+#endif
+    } 
+    // Narrowing store to 4 bytes. NOTE: the aarch64/SSE4.1 paths saturate, the scalar fallback truncates via a cast.
+    static __forceinline void store(unsigned char* ptr, const vint4& v) {
+#if defined(__aarch64__)
+        int32x4_t x = v;
+        uint16x4_t y = vqmovn_u32(uint32x4_t(x));
+        uint8x8_t z = vqmovn_u16(vcombine_u16(y, y));
+        vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0);
+#elif defined(__SSE4_1__)
+      __m128i x = v;
+      x = _mm_packus_epi32(x, x);
+      x = _mm_packus_epi16(x, x);
+      *(int*)ptr = _mm_cvtsi128_si32(x);
+#else
+      for (size_t i=0;i<4;i++)
+        ptr[i] = (unsigned char)v[i];
+#endif
+    }
+    // Narrowing store to four unsigned 16-bit values (saturating on aarch64, truncating cast elsewhere).
+    static __forceinline void store(unsigned short* ptr, const vint4& v) {
+#if defined(__aarch64__)
+      uint32x4_t x = uint32x4_t(v.v);
+      uint16x4_t y = vqmovn_u32(x);
+      vst1_u16(ptr, y);
+#else
+      for (size_t i=0;i<4;i++)
+        ptr[i] = (unsigned short)v[i];
+#endif
+    }
+    // Non-temporal (streaming) load where the target provides one, otherwise a regular aligned load.
+    static __forceinline vint4 load_nt(void* ptr) {
+#if defined(__aarch64__) || defined(__SSE4_1__)
+      return _mm_stream_load_si128((__m128i*)ptr);
+#else
+      return _mm_load_si128((__m128i*)ptr); 
+#endif
+    }
+    // Non-temporal (streaming) store on x86 SSE4.1 builds, otherwise a regular aligned store.
+    static __forceinline void store_nt(void* ptr, const vint4& v) {
+#if !defined(__aarch64__) && defined(__SSE4_1__)
+      _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
+#else
+      _mm_store_si128((__m128i*)ptr,v);
+#endif
+    }
+    // Gather: lane k is read from byte address ptr + scale*index[k].
+    template<int scale = 4>
+    static __forceinline vint4 gather(const int* ptr, const vint4& index) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_i32gather_epi32(ptr, index, scale);
+#else
+      return vint4(
+          *(int*)(((char*)ptr)+scale*index[0]),
+          *(int*)(((char*)ptr)+scale*index[1]),
+          *(int*)(((char*)ptr)+scale*index[2]),
+          *(int*)(((char*)ptr)+scale*index[3]));
+#endif
+    }
+    // Masked gather: only lanes enabled in mask are read; the remaining lanes of the result are zero.
+    template<int scale = 4>
+    static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) {
+      vint4 r = zero;
+#if defined(__AVX512VL__)
+      return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#elif defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]);
+      return r;
+#endif
+    }
+    // Scatter: lane k of v is written to byte address ptr + scale*index[k].
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v)
+    {
+#if defined(__AVX512VL__)
+      _mm_i32scatter_epi32((int*)ptr, index, v, scale);
+#else
+      *(int*)(((char*)ptr)+scale*index[0]) = v[0];
+      *(int*)(((char*)ptr)+scale*index[1]) = v[1];
+      *(int*)(((char*)ptr)+scale*index[2]) = v[2];
+      *(int*)(((char*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+    // Masked scatter: only lanes enabled in mask are written.
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v)
+    {
+#if defined(__AVX512VL__)
+      _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale);
+#else
+      if (likely(mask[0])) *(int*)(((char*)ptr)+scale*index[0]) = v[0];
+      if (likely(mask[1])) *(int*)(((char*)ptr)+scale*index[1]) = v[1];
+      if (likely(mask[2])) *(int*)(((char*)ptr)+scale*index[2]) = v[2];
+      if (likely(mask[3])) *(int*)(((char*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+
+#if defined(__x86_64__) || defined(__aarch64__)
+    static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); }
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator [](size_t index) const { assert(index < 4); return i[index]; }
+    __forceinline       int& operator [](size_t index)       { assert(index < 4); return i[index]; }
+    // Per-lane blend: lanes selected by m take t, all other lanes take f.
+    friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) {
+#if defined(__AVX512VL__)
+      return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
+#elif defined(__aarch64__)
+      return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v));
+#elif defined(__SSE4_1__)
+      return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); 
+#else
+      return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); 
+#endif
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); } // mask bit k is taken from the top bit of element k
+#else
+  __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); } // pure bit reinterpretation as a float-lane mask
+#endif
+  // Unary plus, negation (computed as 0 - a) and absolute value; abs exists only for aarch64 and SSSE3 builds.
+  __forceinline vint4 operator +(const vint4& a) { return a; }
+  __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); }
+#if defined(__aarch64__)
+  __forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); }
+#elif defined(__SSSE3__)
+  __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  // Element-wise 32-bit add and subtract; the int overloads broadcast the scalar to all four lanes first.
+  __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); }
+  __forceinline vint4 operator +(const vint4& a, int          b) { return a + vint4(b); }
+  __forceinline vint4 operator +(int          a, const vint4& b) { return vint4(a) + b; }
+
+  __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); }
+  __forceinline vint4 operator -(const vint4& a, int          b) { return a - vint4(b); }
+  __forceinline vint4 operator -(int          a, const vint4& b) { return vint4(a) - b; }
+  // Element-wise 32-bit multiply; the scalar fallback multiplies as unsigned so overflow wraps like _mm_mullo_epi32.
+#if (defined(__aarch64__)) || defined(__SSE4_1__)
+  __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); }
+#else
+  __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(int((unsigned)a[0]*(unsigned)b[0]),int((unsigned)a[1]*(unsigned)b[1]),int((unsigned)a[2]*(unsigned)b[2]),int((unsigned)a[3]*(unsigned)b[3])); }
+#endif
+  __forceinline vint4 operator *(const vint4& a, int          b) { return a * vint4(b); }
+  __forceinline vint4 operator *(int          a, const vint4& b) { return vint4(a) * b; }
+  // Bitwise AND / OR / XOR over the full 128 bits; the int overloads broadcast the scalar first.
+  __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); }
+  __forceinline vint4 operator &(const vint4& a, int          b) { return a & vint4(b); }
+  __forceinline vint4 operator &(int          a, const vint4& b) { return vint4(a) & b; }
+
+  __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a, b); }
+  __forceinline vint4 operator |(const vint4& a, int          b) { return a | vint4(b); }
+  __forceinline vint4 operator |(int          a, const vint4& b) { return vint4(a) | b; }
+
+  __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); }
+  __forceinline vint4 operator ^(const vint4& a, int          b) { return a ^ vint4(b); }
+  __forceinline vint4 operator ^(int          a, const vint4& b) { return vint4(a) ^ b; }
+  // Shifts of every lane by the same count: << / sll shift left, >> / sra shift right arithmetically, srl logically.
+  __forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); }
+  __forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); }
+
+  __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); }
+  __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); }
+  __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Compound assignments: each applies the matching binary operator and stores the result back into a.
+  __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; }
+  __forceinline vint4& operator +=(vint4& a, int          b) { return a = a + b; }
+
+  __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; }
+  __forceinline vint4& operator -=(vint4& a, int          b) { return a = a - b; }
+
+  // operator * is defined for every target (intrinsic or scalar fallback), so *= needs no ISA guard.
+  __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; }
+  __forceinline vint4& operator *=(vint4& a, int          b) { return a = a * b; }
+
+  __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; }
+  __forceinline vint4& operator &=(vint4& a, int          b) { return a = a & b; }
+
+  __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; }
+  __forceinline vint4& operator |=(vint4& a, int          b) { return a = a | b; }
+
+  __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; }
+  __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  // Signed per-lane comparisons. The AVX-512VL build uses the mask-producing compare for all six predicates;
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+#else // otherwise only ==, < and > map to intrinsics and !=, >=, <= are their negations
+  __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
+  __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); }
+  __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); }
+  __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a <  b); }
+  __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); }
+  __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a >  b); }
+#endif
+  // Mixed vector/scalar comparisons: the int operand is broadcast to a vint4 and the vector operator is used.
+  __forceinline vboolf4 operator ==(const vint4& a, int          b) { return a == vint4(b); }
+  __forceinline vboolf4 operator ==(int          a, const vint4& b) { return vint4(a) == b; }
+
+  __forceinline vboolf4 operator !=(const vint4& a, int          b) { return a != vint4(b); }
+  __forceinline vboolf4 operator !=(int          a, const vint4& b) { return vint4(a) != b; }
+
+  __forceinline vboolf4 operator < (const vint4& a, int          b) { return a <  vint4(b); }
+  __forceinline vboolf4 operator < (int          a, const vint4& b) { return vint4(a) <  b; }
+
+  __forceinline vboolf4 operator >=(const vint4& a, int          b) { return a >= vint4(b); }
+  __forceinline vboolf4 operator >=(int          a, const vint4& b) { return vint4(a) >= b; }
+
+  __forceinline vboolf4 operator > (const vint4& a, int          b) { return a >  vint4(b); }
+  __forceinline vboolf4 operator > (int          a, const vint4& b) { return vint4(a) >  b; }
+
+  __forceinline vboolf4 operator <=(const vint4& a, int          b) { return a <= vint4(b); }
+  __forceinline vboolf4 operator <=(int          a, const vint4& b) { return vint4(a) <= b; }
+  // Named function forms of the comparison operators.
+  __forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; }
+  __forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; }
+  __forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a <  b; }
+  __forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; }
+  __forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a >  b; }
+  __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; }
+  // Masked comparisons: the comparison result restricted to the lanes enabled in mask.
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <  b); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >  b); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); }
+#endif
+  // Compile-time blend: lane k takes t when bit k of mask is set, otherwise f (fallback goes through vboolf4(mask)).
+  template<int mask>
+  __forceinline vint4 select(const vint4& t, const vint4& f) {
+#if defined(__SSE4_1__) 
+    return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
+#else
+    return select(vboolf4(mask), t, f);
+#endif    
+  }
+  // Element-wise signed min/max; umin/umax (unsigned) exist only on aarch64/SSE4.1; SSE2 uses compare + select.
+#if defined(__aarch64__) || defined(__SSE4_1__)
+  __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); }
+  __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); }
+
+  __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); }
+  __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); }
+
+#else
+  __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); }
+  __forceinline vint4 max(const vint4& a, const vint4& b) { return select(a < b,b,a); }
+#endif
+
+  __forceinline vint4 min(const vint4& a, int          b) { return min(a,vint4(b)); }
+  __forceinline vint4 min(int          a, const vint4& b) { return min(vint4(a),b); }
+  __forceinline vint4 max(const vint4& a, int          b) { return max(a,vint4(b)); }
+  __forceinline vint4 max(int          a, const vint4& b) { return max(vint4(a),b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+  // Interleave the low (unpacklo: a0,b0,a1,b1) or high (unpackhi: a2,b2,a3,b3) halves of a and b.
+  __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
+  __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
+  // shuffle<i0,i1,i2,i3>(v): element k of the result is v[ik]; on x86 the (a,b) form takes elements 0,1 from a and 2,3 from b.
+#if defined(__aarch64__)
+    template<int i0, int i1, int i2, int i3>
+    __forceinline vint4 shuffle(const vint4& v) {
+        return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+    }
+    template<int i0, int i1, int i2, int i3>
+    __forceinline vint4 shuffle(const vint4& a, const vint4& b) {
+        return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+    }
+#else
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint4 shuffle(const vint4& v) {
+    return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint4 shuffle(const vint4& a, const vint4& b) {
+    return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+#endif
+#if defined(__SSE3__) // dedicated duplicate instructions for three common patterns
+  template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); }
+#endif
+  // Broadcasts element i of v to all four lanes.
+  template<int i>
+  __forceinline vint4 shuffle(const vint4& v) {
+    return shuffle<i,i,i,i>(v);
+  }
+  // extract<src>: element src as a scalar; insert<dst>: copy of a with element dst replaced by b.
+#if defined(__SSE4_1__) && !defined(__aarch64__)
+  template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); }
+  template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); }
+#else
+  template<int src> __forceinline int extract(const vint4& b) { return b[src&3]; }
+  template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; }
+#endif
+
+  template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); }
+  // Returns element 0 of v as a scalar int.
+  __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); }
+  // Returns the low 64 bits of v as a size_t (the win32 workaround path returns only element 0 via toScalar).
+#if defined(__aarch64__)
+  __forceinline size_t toSizeT(const vint4& v) {
+    uint64x2_t x = uint64x2_t(v.v);
+    return x[0];
+  }
+#else
+__forceinline size_t toSizeT(const vint4& v) { 
+#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround
+    return toScalar(v);
+#elif defined(__ARM_NEON)
+    // FIXME(LTE): Do we need a swap(i.e. use lane 1)?
+    return vgetq_lane_u64(*(reinterpret_cast<const uint64x2_t *>(&v)), 0);
+#else
+    return _mm_cvtsi128_si64(v); 
+#endif
+  }
+#endif
+  // AVX-512VL only: permute selects a[index[k]] per lane; align_shift_right<i> shifts the concatenation a:b right by i elements.
+#if defined(__AVX512VL__)
+
+  __forceinline vint4 permute(const vint4 &a, const vint4 &index) {
+    return  _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index));
+  }
+
+  template<int i>
+  __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) {
+    return _mm_alignr_epi32(a, b, i);    
+  }  
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+
+#if defined(__aarch64__)
+    __forceinline vint4 vreduce_min(const vint4& v) { return vdupq_n_s32(vminvq_s32(v)); } // horizontal minimum, replicated into every lane
+    __forceinline vint4 vreduce_max(const vint4& v) { return vdupq_n_s32(vmaxvq_s32(v)); } // horizontal maximum, replicated into every lane
+    __forceinline vint4 vreduce_add(const vint4& v) { return vdupq_n_s32(vaddvq_s32(v)); } // horizontal sum, replicated into every lane
+
+    __forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); } // scalar minimum over the four lanes (NEON across-vector reduction)
+    __forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); } // scalar maximum over the four lanes
+    __forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); } // scalar sum of the four lanes
+#else
+  __forceinline vint4 vreduce_min(const vint4& v) { const vint4 pairMin = min(shuffle<1,0,3,2>(v),v); const vint4 swapped = shuffle<2,3,0,1>(pairMin); return min(swapped,pairMin); } // two shuffle+min steps leave the overall minimum in every lane
+  __forceinline vint4 vreduce_max(const vint4& v) { const vint4 pairMax = max(shuffle<1,0,3,2>(v),v); const vint4 swapped = shuffle<2,3,0,1>(pairMax); return max(swapped,pairMax); } // same scheme with max
+  __forceinline vint4 vreduce_add(const vint4& v) { const vint4 pairSum = shuffle<1,0,3,2>(v) + v; const vint4 swapped = shuffle<2,3,0,1>(pairSum); return swapped + pairSum; } // same scheme with addition
+
+  __forceinline int reduce_min(const vint4& v) { const vint4 all = vreduce_min(v); return toScalar(all); } // scalar minimum: vector reduction, then read lane 0
+  __forceinline int reduce_max(const vint4& v) { const vint4 all = vreduce_max(v); return toScalar(all); } // scalar maximum: vector reduction, then read lane 0
+  __forceinline int reduce_add(const vint4& v) { const vint4 all = vreduce_add(v); return toScalar(all); } // scalar sum: vector reduction, then read lane 0
+#endif
+
+  __forceinline size_t select_min(const vint4& v) { const vint4 lowest = vreduce_min(v); return bsf(movemask(v == lowest)); } // bsf over the mask of lanes equal to the minimum
+  __forceinline size_t select_max(const vint4& v) { const vint4 highest = vreduce_max(v); return bsf(movemask(v == highest)); } // bsf over the mask of lanes equal to the maximum
+
+  __forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 masked = select(valid,v,vint4(pos_inf)); const vint4 lowest = vreduce_min(masked); return bsf(movemask(valid & (masked == lowest))); } // inactive lanes are replaced by pos_inf before reducing
+  __forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 masked = select(valid,v,vint4(neg_inf)); const vint4 highest = vreduce_max(masked); return bsf(movemask(valid & (masked == highest))); } // inactive lanes are replaced by neg_inf before reducing
+
+#else
+
+  __forceinline int reduce_min(const vint4& v) { return min(v[0],v[1],v[2],v[3]); } // scalar fallback: minimum of the four lanes read one by one
+  __forceinline int reduce_max(const vint4& v) { return max(v[0],v[1],v[2],v[3]); } // scalar fallback: maximum of the four lanes
+  __forceinline int reduce_add(const vint4& v) { return v[0]+v[1]+v[2]+v[3]; } // scalar fallback: sum of the four lanes
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting networks
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if (defined(__aarch64__)) || defined(__SSE4_1__)
+
+  __forceinline vint4 usort_ascending(const vint4& v) // 3-stage compare-exchange network on the four lanes using unsigned umin/umax; ascending per its name
+  {
+    const vint4 a0 = v;
+    const vint4 b0 = shuffle<1,0,3,2>(a0); // stage 1 partner: neighbouring lanes swapped (0<->1, 2<->3)
+    const vint4 c0 = umin(a0,b0);
+    const vint4 d0 = umax(a0,b0);
+    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); // NOTE(review): assumes bit k set => lane k taken from the first operand (min in lanes 0,2; max in 1,3) - confirm against select<mask>
+    const vint4 b1 = shuffle<2,3,0,1>(a1); // stage 2 partner: the two lane pairs swapped (0<->2, 1<->3)
+    const vint4 c1 = umin(a1,b1);
+    const vint4 d1 = umax(a1,b1);
+    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); // under the same assumption: min in lanes 0,1; max in lanes 2,3
+    const vint4 b2 = shuffle<0,2,1,3>(a2); // stage 3 partner: only the two middle lanes are exchanged
+    const vint4 c2 = umin(a2,b2);
+    const vint4 d2 = umax(a2,b2);
+    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); // lanes 0 and 3 were compared with themselves, so only the middle pair can change
+    return a3;
+  }
+
+  __forceinline vint4 usort_descending(const vint4& v) // same network as usort_ascending with umin/umax exchanged; descending per its name
+  {
+    const vint4 a0 = v;
+    const vint4 b0 = shuffle<1,0,3,2>(a0); // stage 1 partner: neighbouring lanes swapped (0<->1, 2<->3)
+    const vint4 c0 = umax(a0,b0);
+    const vint4 d0 = umin(a0,b0);
+    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); // NOTE(review): assumes bit k set => lane k taken from the first operand (max in lanes 0,2; min in 1,3) - confirm against select<mask>
+    const vint4 b1 = shuffle<2,3,0,1>(a1); // stage 2 partner: the two lane pairs swapped (0<->2, 1<->3)
+    const vint4 c1 = umax(a1,b1);
+    const vint4 d1 = umin(a1,b1);
+    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); // under the same assumption: max in lanes 0,1; min in lanes 2,3
+    const vint4 b2 = shuffle<0,2,1,3>(a2); // stage 3 partner: only the two middle lanes are exchanged
+    const vint4 c2 = umax(a2,b2);
+    const vint4 d2 = umin(a2,b2);
+    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); // lanes 0 and 3 were compared with themselves, so only the middle pair can change
+    return a3;
+  }
+
+#else
+
+  __forceinline vint4 usort_ascending(const vint4& v) // fallback without umin/umax: same 3-stage network on biased values with signed min/max
+  {
+    const vint4 a0 = v-vint4(0x80000000); // bias by 2^31 (flips the top bit) so signed ordering of a0 matches unsigned ordering of v
+    const vint4 b0 = shuffle<1,0,3,2>(a0); // stage 1 partner: neighbouring lanes swapped (0<->1, 2<->3)
+    const vint4 c0 = min(a0,b0);
+    const vint4 d0 = max(a0,b0);
+    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); // NOTE(review): assumes bit k set => lane k taken from the first operand - confirm against select<mask>
+    const vint4 b1 = shuffle<2,3,0,1>(a1); // stage 2 partner: the two lane pairs swapped (0<->2, 1<->3)
+    const vint4 c1 = min(a1,b1);
+    const vint4 d1 = max(a1,b1);
+    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+    const vint4 b2 = shuffle<0,2,1,3>(a2); // stage 3 partner: only the two middle lanes are exchanged
+    const vint4 c2 = min(a2,b2);
+    const vint4 d2 = max(a2,b2);
+    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+    return a3+vint4(0x80000000); // remove the bias again
+  }
+
+  __forceinline vint4 usort_descending(const vint4& v) // fallback without umin/umax: descending variant (min/max exchanged) on biased values
+  {
+    const vint4 a0 = v-vint4(0x80000000); // bias by 2^31 (flips the top bit) so signed ordering of a0 matches unsigned ordering of v
+    const vint4 b0 = shuffle<1,0,3,2>(a0); // stage 1 partner: neighbouring lanes swapped (0<->1, 2<->3)
+    const vint4 c0 = max(a0,b0);
+    const vint4 d0 = min(a0,b0);
+    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); // NOTE(review): assumes bit k set => lane k taken from the first operand - confirm against select<mask>
+    const vint4 b1 = shuffle<2,3,0,1>(a1); // stage 2 partner: the two lane pairs swapped (0<->2, 1<->3)
+    const vint4 c1 = max(a1,b1);
+    const vint4 d1 = min(a1,b1);
+    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+    const vint4 b2 = shuffle<0,2,1,3>(a2); // stage 3 partner: only the two middle lanes are exchanged
+    const vint4 c2 = max(a2,b2);
+    const vint4 d2 = min(a2,b2);
+    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+    return a3+vint4(0x80000000); // remove the bias again
+  }
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) { // writes the four lanes in index order as "<a0, a1, a2, a3>"
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; // returns the result of the chained << so calls can be chained further
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vint8_avx.h
+++ b/thirdparty/embree/common/simd/vint8_avx.h
@@ -0,0 +1,470 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX integer type: one 256-bit register, also addressable as two 128-bit halves (vl, vh) or eight ints (i) */
+  template<>
+  struct vint<8>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };        // number of SIMD elements
+    union {                    // data
+      __m256i v;                 // all eight lanes as one 256-bit register
+      struct { __m128i vl,vh; }; // the same storage as two 128-bit halves (vl first, vh second)
+      int i[8];                  // the same storage as eight scalar lanes
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint() {} // lanes are left uninitialized
+    __forceinline vint(const vint8& a) { v = a.v; }
+    __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; }
+
+    __forceinline vint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} // a in both 128-bit halves
+    __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} // a = low half, b = high half
+    __forceinline vint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {}
+ 
+    __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} // unaligned load of eight ints
+    __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {}
+    __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} // set_epi32 lists lane 7 first, so lanes 0..7 = a,b,a,b,a,b,a,b
+    __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} // a,b,c,d in both halves
+    __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} // lanes 0..7 = a..h; last parameter named h so it no longer shadows member vh
+
+    __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} // float -> int value conversion, not a bit cast
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint(ZeroTy)        : v(_mm256_setzero_si256()) {}
+    __forceinline vint(OneTy)         : v(_mm256_set_epi32(1,1,1,1,1,1,1,1)) {}
+    __forceinline vint(PosInfTy)      : v(_mm256_set_epi32(pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf)) {}
+    __forceinline vint(NegInfTy)      : v(_mm256_set_epi32(neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf)) {}
+    __forceinline vint(StepTy)        : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} // lane k holds k
+    __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} // lane k holds 7-k
+    __forceinline vint(UndefinedTy)   : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((const float*)a)); } // aligned load (float load + bit cast); const kept on the pointer
+    static __forceinline vint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((const float*)a)); } // unaligned load
+
+    static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((const float*)a,mask)); } // masked load
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((const float*)a,mask)); } // same instruction as the masked load above
+
+    static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+    
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); }
+
+    static __forceinline void store_nt(void* ptr, const vint8& v) { // store through the streaming (non-temporal) intrinsic
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline vint8 load(const unsigned char* ptr) { // two vint4 byte loads: elements 0..3 and 4..7
+      vint4 il = vint4::load(ptr+0);
+      vint4 ih = vint4::load(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline vint8 loadu(const unsigned char* ptr) { // unaligned counterpart of the byte load
+      vint4 il = vint4::loadu(ptr+0);
+      vint4 ih = vint4::loadu(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline vint8 load(const unsigned short* ptr) { // two vint4 16-bit loads: elements 0..3 and 4..7
+      vint4 il = vint4::load(ptr+0);
+      vint4 ih = vint4::load(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline vint8 loadu(const unsigned short* ptr) { // unaligned counterpart of the 16-bit load
+      vint4 il = vint4::loadu(ptr+0);
+      vint4 ih = vint4::loadu(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline void store(unsigned char* ptr, const vint8& i) { // stores each half through vint4::store(unsigned char*, ...)
+      vint4 il(i.vl);
+      vint4 ih(i.vh);
+      vint4::store(ptr + 0,il);
+      vint4::store(ptr + 4,ih);
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vint8& v) { // per-lane narrowing cast to unsigned short
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    template<int scale = 4>
+    static __forceinline vint8 gather(const int* ptr, const vint8& index) { // lane k = int read at byte offset scale*index[k] from ptr
+      return vint8(
+          *(const int*)(((const char*)ptr)+scale*index[0]),
+          *(const int*)(((const char*)ptr)+scale*index[1]),
+          *(const int*)(((const char*)ptr)+scale*index[2]),
+          *(const int*)(((const char*)ptr)+scale*index[3]),
+          *(const int*)(((const char*)ptr)+scale*index[4]),
+          *(const int*)(((const char*)ptr)+scale*index[5]),
+          *(const int*)(((const char*)ptr)+scale*index[6]),
+          *(const int*)(((const char*)ptr)+scale*index[7]));
+    }
+
+    template<int scale = 4>
+    static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) { // masked gather: lanes with a false mask are not read
+      vint8 r = zero; // skipped lanes stay 0
+      if (likely(mask[0])) r[0] = *(const int*)(((const char*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(const int*)(((const char*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(const int*)(((const char*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(const int*)(((const char*)ptr)+scale*index[3]);
+      if (likely(mask[4])) r[4] = *(const int*)(((const char*)ptr)+scale*index[4]);
+      if (likely(mask[5])) r[5] = *(const int*)(((const char*)ptr)+scale*index[5]);
+      if (likely(mask[6])) r[6] = *(const int*)(((const char*)ptr)+scale*index[6]);
+      if (likely(mask[7])) r[7] = *(const int*)(((const char*)ptr)+scale*index[7]);
+      return r;
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v)
+    { // writes lane k of v as an int at byte offset scale*ofs[k] from ptr, in lane order
+      *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v)
+    { // masked scatter: only lanes with a true mask are written
+      if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+
+    static __forceinline vint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } // the 64-bit value replicated across the register
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } // lane access; range checked by assert only
+    __forceinline       int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } // reinterprets the integer bits as a mask; no value conversion
+
+  __forceinline vint8 operator +(const vint8& a) { return a; } // unary plus: identity
+  __forceinline vint8 operator -(const vint8& a) { const __m128i lo = _mm_sub_epi32(_mm_setzero_si128(), a.vl); const __m128i hi = _mm_sub_epi32(_mm_setzero_si128(), a.vh); return vint8(lo, hi); } // negation computed as 0 - a on each 128-bit half
+  __forceinline vint8 abs       (const vint8& a) { const __m128i lo = _mm_abs_epi32(a.vl); const __m128i hi = _mm_abs_epi32(a.vh); return vint8(lo, hi); } // absolute value on each 128-bit half
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 operator +(const vint8& a, const vint8& b) { const __m128i lo = _mm_add_epi32(a.vl, b.vl); const __m128i hi = _mm_add_epi32(a.vh, b.vh); return vint8(lo, hi); } // lane-wise add, done per 128-bit half
+  __forceinline vint8 operator +(const vint8& a, int          b) { const vint8 rhs(b); return a + rhs; } // scalar is broadcast first
+  __forceinline vint8 operator +(int          a, const vint8& b) { const vint8 lhs(a); return lhs + b; }
+
+  __forceinline vint8 operator -(const vint8& a, const vint8& b) { const __m128i lo = _mm_sub_epi32(a.vl, b.vl); const __m128i hi = _mm_sub_epi32(a.vh, b.vh); return vint8(lo, hi); } // lane-wise subtract, done per 128-bit half
+  __forceinline vint8 operator -(const vint8& a, int          b) { const vint8 rhs(b); return a - rhs; }
+  __forceinline vint8 operator -(int          a, const vint8& b) { const vint8 lhs(a); return lhs - b; }
+
+  __forceinline vint8 operator *(const vint8& a, const vint8& b) { const __m128i lo = _mm_mullo_epi32(a.vl, b.vl); const __m128i hi = _mm_mullo_epi32(a.vh, b.vh); return vint8(lo, hi); } // lane-wise multiply keeping the low 32 bits of each product
+  __forceinline vint8 operator *(const vint8& a, int          b) { const vint8 rhs(b); return a * rhs; }
+  __forceinline vint8 operator *(int          a, const vint8& b) { const vint8 lhs(a); return lhs * b; }
+
+  __forceinline vint8 operator &(const vint8& a, const vint8& b) { const __m256 bits = _mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)); return _mm256_castps_si256(bits); } // bitwise AND via the 256-bit float instruction; casts only reinterpret
+  __forceinline vint8 operator &(const vint8& a, int          b) { const vint8 rhs(b); return a & rhs; } // scalar is broadcast first
+  __forceinline vint8 operator &(int          a, const vint8& b) { const vint8 lhs(a); return lhs & b; }
+
+  __forceinline vint8 operator |(const vint8& a, const vint8& b) { const __m256 bits = _mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)); return _mm256_castps_si256(bits); } // bitwise OR
+  __forceinline vint8 operator |(const vint8& a, int          b) { const vint8 rhs(b); return a | rhs; }
+  __forceinline vint8 operator |(int          a, const vint8& b) { const vint8 lhs(a); return lhs | b; }
+
+  __forceinline vint8 operator ^(const vint8& a, const vint8& b) { const __m256 bits = _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)); return _mm256_castps_si256(bits); } // bitwise XOR
+  __forceinline vint8 operator ^(const vint8& a, int          b) { const vint8 rhs(b); return a ^ rhs; }
+  __forceinline vint8 operator ^(int          a, const vint8& b) { const vint8 lhs(a); return lhs ^ b; }
+
+  __forceinline vint8 operator <<(const vint8& a, int n) { const __m128i lo = _mm_slli_epi32(a.vl, n); const __m128i hi = _mm_slli_epi32(a.vh, n); return vint8(lo, hi); } // left shift of every lane by n bits
+  __forceinline vint8 operator >>(const vint8& a, int n) { const __m128i lo = _mm_srai_epi32(a.vl, n); const __m128i hi = _mm_srai_epi32(a.vh, n); return vint8(lo, hi); } // arithmetic (sign-filling) right shift
+
+  __forceinline vint8 sll (const vint8& a, int b) { const __m128i lo = _mm_slli_epi32(a.vl, b); const __m128i hi = _mm_slli_epi32(a.vh, b); return vint8(lo, hi); } // shift left logical
+  __forceinline vint8 sra (const vint8& a, int b) { const __m128i lo = _mm_srai_epi32(a.vl, b); const __m128i hi = _mm_srai_epi32(a.vh, b); return vint8(lo, hi); } // shift right arithmetic
+  __forceinline vint8 srl (const vint8& a, int b) { const __m128i lo = _mm_srli_epi32(a.vl, b); const __m128i hi = _mm_srli_epi32(a.vh, b); return vint8(lo, hi); } // shift right logical (zero-filling)
+  
+  __forceinline vint8 min(const vint8& a, const vint8& b) { const __m128i lo = _mm_min_epi32(a.vl, b.vl); const __m128i hi = _mm_min_epi32(a.vh, b.vh); return vint8(lo, hi); } // signed lane-wise minimum
+  __forceinline vint8 min(const vint8& a, int          b) { const vint8 rhs(b); return min(a,rhs); } // scalar is broadcast first
+  __forceinline vint8 min(int          a, const vint8& b) { const vint8 lhs(a); return min(lhs,b); }
+
+  __forceinline vint8 max(const vint8& a, const vint8& b) { const __m128i lo = _mm_max_epi32(a.vl, b.vl); const __m128i hi = _mm_max_epi32(a.vh, b.vh); return vint8(lo, hi); } // signed lane-wise maximum
+  __forceinline vint8 max(const vint8& a, int          b) { const vint8 rhs(b); return max(a,rhs); }
+  __forceinline vint8 max(int          a, const vint8& b) { const vint8 lhs(a); return max(lhs,b); }
+
+  __forceinline vint8 umin(const vint8& a, const vint8& b) { const __m128i lo = _mm_min_epu32(a.vl, b.vl); const __m128i hi = _mm_min_epu32(a.vh, b.vh); return vint8(lo, hi); } // lane-wise minimum treating lanes as unsigned
+  __forceinline vint8 umin(const vint8& a, int          b) { const vint8 rhs(b); return umin(a,rhs); }
+  __forceinline vint8 umin(int          a, const vint8& b) { const vint8 lhs(a); return umin(lhs,b); }
+
+  __forceinline vint8 umax(const vint8& a, const vint8& b) { const __m128i lo = _mm_max_epu32(a.vl, b.vl); const __m128i hi = _mm_max_epu32(a.vh, b.vh); return vint8(lo, hi); } // lane-wise maximum treating lanes as unsigned
+  __forceinline vint8 umax(const vint8& a, int          b) { const vint8 rhs(b); return umax(a,rhs); }
+  __forceinline vint8 umax(int          a, const vint8& b) { const vint8 lhs(a); return umax(lhs,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8& operator +=(vint8& a, const vint8& b) { a = a + b; return a; } // each compound form evaluates the binary operator and assigns back into a
+  __forceinline vint8& operator +=(vint8& a, int          b) { a = a + b; return a; }
+
+  __forceinline vint8& operator -=(vint8& a, const vint8& b) { a = a - b; return a; }
+  __forceinline vint8& operator -=(vint8& a, int          b) { a = a - b; return a; }
+
+  __forceinline vint8& operator *=(vint8& a, const vint8& b) { a = a * b; return a; }
+  __forceinline vint8& operator *=(vint8& a, int          b) { a = a * b; return a; }
+
+  __forceinline vint8& operator &=(vint8& a, const vint8& b) { a = a & b; return a; }
+  __forceinline vint8& operator &=(vint8& a, int          b) { a = a & b; return a; }
+
+  __forceinline vint8& operator |=(vint8& a, const vint8& b) { a = a | b; return a; }
+  __forceinline vint8& operator |=(vint8& a, int          b) { a = a | b; return a; }
+
+  __forceinline vint8& operator <<=(vint8& a, int b) { a = a << b; return a; } // shift count applies to every lane
+  __forceinline vint8& operator >>=(vint8& a, int b) { a = a >> b; return a; } // uses operator >>, i.e. the arithmetic shift
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), // lane-wise equality, low half
+                                                                                     _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } // high half
+  __forceinline vboolf8 operator ==(const vint8& a, int          b) { return a == vint8(b); } // scalar is broadcast first
+  __forceinline vboolf8 operator ==(int          a, const vint8& b) { return vint8(a) == b; }
+  
+  __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } // derived: negated equality mask
+  __forceinline vboolf8 operator !=(const vint8& a, int          b) { return a != vint8(b); }
+  __forceinline vboolf8 operator !=(int          a, const vint8& b) { return vint8(a) != b; }
+  
+  __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epi32 (a.vl, b.vl)), // signed less-than, low half
+                                                                                     _mm_castsi128_ps(_mm_cmplt_epi32 (a.vh, b.vh))); } // high half
+  __forceinline vboolf8 operator < (const vint8& a, int          b) { return a <  vint8(b); }
+  __forceinline vboolf8 operator < (int          a, const vint8& b) { return vint8(a) <  b; }
+  
+  __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a <  b); } // derived: negated less-than mask
+  __forceinline vboolf8 operator >=(const vint8& a, int          b) { return a >= vint8(b); }
+  __forceinline vboolf8 operator >=(int          a, const vint8& b) { return vint8(a) >= b; }
+
+  __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epi32 (a.vl, b.vl)), // signed greater-than, low half
+                                                                                     _mm_castsi128_ps(_mm_cmpgt_epi32 (a.vh, b.vh))); } // high half
+  __forceinline vboolf8 operator > (const vint8& a, int          b) { return a >  vint8(b); }
+  __forceinline vboolf8 operator > (int          a, const vint8& b) { return vint8(a) >  b; }
+
+  __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a >  b); } // derived: negated greater-than mask
+  __forceinline vboolf8 operator <=(const vint8& a, int          b) { return a <= vint8(b); }
+  __forceinline vboolf8 operator <=(int          a, const vint8& b) { return vint8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vint8& a, const vint8& b) { const vboolf8 r = (a == b); return r; } // named forms of the comparison operators
+  __forceinline vboolf8 ne(const vint8& a, const vint8& b) { const vboolf8 r = (a != b); return r; }
+  __forceinline vboolf8 lt(const vint8& a, const vint8& b) { const vboolf8 r = (a <  b); return r; }
+  __forceinline vboolf8 ge(const vint8& a, const vint8& b) { const vboolf8 r = (a >= b); return r; }
+  __forceinline vboolf8 gt(const vint8& a, const vint8& b) { const vboolf8 r = (a >  b); return r; }
+  __forceinline vboolf8 le(const vint8& a, const vint8& b) { const vboolf8 r = (a <= b); return r; }
+
+  __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { const vboolf8 r = (a == b); return mask & r; } // masked forms: comparison result ANDed with mask
+  __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { const vboolf8 r = (a != b); return mask & r; }
+  __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { const vboolf8 r = (a <  b); return mask & r; }
+  __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { const vboolf8 r = (a >= b); return mask & r; }
+  __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { const vboolf8 r = (a >  b); return mask & r; }
+  __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { const vboolf8 r = (a <= b); return mask & r; }
+
+  __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { // lane-wise choice between t and f under mask m
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); // blendv takes its second operand (t) where the mask lane's top bit is set, else f
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } // integer wrapper over _mm256_unpacklo_ps (casts only reinterpret bits)
+  __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } // integer wrapper over _mm256_unpackhi_ps
+
+  template<int i>
+  __forceinline vint8 shuffle(const vint8& v) { // single-index form: the same index i is used for all four positions of the permute
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1>
+  __forceinline vint8 shuffle4(const vint8& v) { // 128-bit granularity shuffle of v with itself
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); // control byte: i0 in the low nibble, i1 in the high nibble
+  }
+
+  template<int i0, int i1>
+  __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { // 128-bit granularity shuffle drawing from two sources
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); // NOTE(review): presumably indices 0/1 pick halves of a and 2/3 halves of b - confirm against the intrinsic reference
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint8 shuffle(const vint8& v) { // four-index permute; the one immediate is shared by the whole 256-bit permute
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); // _MM_SHUFFLE takes the indices in reverse order
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint8 shuffle(const vint8& a, const vint8& b) { // two-source shuffle via _mm256_shuffle_ps
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } // specialization using the dedicated moveldup instruction
+  template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } // specialization using the dedicated movehdup instruction
+  template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } // specialization using movedup on the data viewed as doubles
+
+  __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } // loads the 32-bit value at ptr and replicates it into all lanes
+  template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } // copy of a with 128-bit half i replaced by b
+  template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } // returns 128-bit half i of a
+  template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } // half 0 specialization: uses the cast intrinsic instead of an extract
+
+  __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } // returns lane 0 of v
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 vreduce_min2(const vint8& v) { const vint8 neighbour = shuffle<1,0,3,2>(v); return min(v,neighbour); } // minimum of each adjacent lane pair
+  __forceinline vint8 vreduce_min4(const vint8& v) { const vint8 pairs = vreduce_min2(v); return min(pairs,shuffle<2,3,0,1>(pairs)); } // minimum within each group of four lanes
+  __forceinline vint8 vreduce_min (const vint8& v) { const vint8 quads = vreduce_min4(v); return min(quads,shuffle4<1,0>(quads)); } // minimum of all eight lanes, present in every lane
+
+  __forceinline vint8 vreduce_max2(const vint8& v) { const vint8 neighbour = shuffle<1,0,3,2>(v); return max(v,neighbour); } // maximum of each adjacent lane pair
+  __forceinline vint8 vreduce_max4(const vint8& v) { const vint8 pairs = vreduce_max2(v); return max(pairs,shuffle<2,3,0,1>(pairs)); } // maximum within each group of four lanes
+  __forceinline vint8 vreduce_max (const vint8& v) { const vint8 quads = vreduce_max4(v); return max(quads,shuffle4<1,0>(quads)); } // maximum of all eight lanes, present in every lane
+
+  __forceinline vint8 vreduce_add2(const vint8& v) { const vint8 neighbour = shuffle<1,0,3,2>(v); return v + neighbour; } // sum of each adjacent lane pair
+  __forceinline vint8 vreduce_add4(const vint8& v) { const vint8 pairs = vreduce_add2(v); return pairs + shuffle<2,3,0,1>(pairs); } // sum within each group of four lanes
+  __forceinline vint8 vreduce_add (const vint8& v) { const vint8 quads = vreduce_add4(v); return quads + shuffle4<1,0>(quads); } // sum of all eight lanes, present in every lane
+
+  __forceinline int reduce_min(const vint8& v) { const vint8 all = vreduce_min(v); return toScalar(all); } // scalar minimum: vector reduction, then read lane 0
+  __forceinline int reduce_max(const vint8& v) { const vint8 all = vreduce_max(v); return toScalar(all); } // scalar maximum: vector reduction, then read lane 0
+  __forceinline int reduce_add(const vint8& v) { const vint8 all = vreduce_add(v); return toScalar(all); } // scalar sum: vector reduction, then read lane 0
+
+  __forceinline size_t select_min(const vint8& v) { const vint8 lowest = vreduce_min(v); return bsf(movemask(v == lowest)); } // bsf over the mask of lanes equal to the minimum
+  __forceinline size_t select_max(const vint8& v) { const vint8 highest = vreduce_max(v); return bsf(movemask(v == highest)); } // bsf over the mask of lanes equal to the maximum
+
+  __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 masked = select(valid,v,vint8(pos_inf)); const vint8 lowest = vreduce_min(masked); return bsf(movemask(valid & (masked == lowest))); } // inactive lanes are replaced by pos_inf before reducing
+  __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 masked = select(valid,v,vint8(neg_inf)); const vint8 highest = vreduce_max(masked); return bsf(movemask(valid & (masked == highest))); } // inactive lanes are replaced by neg_inf before reducing
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting networks
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 usort_ascending(const vint8& v) // six compare-exchange stages over the eight lanes using unsigned umin/umax; ascending per its name
+  {
+    const vint8 a0 = v;
+    const vint8 b0 = shuffle<1,0,3,2>(a0); // stage 1 partner: adjacent lanes swapped
+    const vint8 c0 = umin(a0,b0);
+    const vint8 d0 = umax(a0,b0);
+    const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); // NOTE(review): assumes bit k of the integer mask picks lane k from the first operand (here: min in lanes 0,3,4,7) - confirm against select/vboolf8
+    const vint8 b1 = shuffle<2,3,0,1>(a1); // stage 2 partner: lane pairs swapped inside each group of four
+    const vint8 c1 = umin(a1,b1);
+    const vint8 d1 = umax(a1,b1);
+    const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); // same assumption: min in lanes 0,1,6,7
+    const vint8 b2 = shuffle<1,0,3,2>(a2); // stage 3 partner: adjacent lanes swapped
+    const vint8 c2 = umin(a2,b2);
+    const vint8 d2 = umax(a2,b2);
+    const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); // same assumption: min in lanes 0,2,5,7
+    const vint8 b3 = shuffle4<1,0>(a3); // stage 4 partner: the two 128-bit halves swapped
+    const vint8 c3 = umin(a3,b3);
+    const vint8 d3 = umax(a3,b3);
+    const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); // same assumption: min in lanes 0..3, max in lanes 4..7
+    const vint8 b4 = shuffle<2,3,0,1>(a4); // stage 5 partner: lane pairs swapped inside each group of four
+    const vint8 c4 = umin(a4,b4);
+    const vint8 d4 = umax(a4,b4);
+    const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); // same assumption: min in lanes 0,1,4,5
+    const vint8 b5 = shuffle<1,0,3,2>(a5); // stage 6 partner: adjacent lanes swapped
+    const vint8 c5 = umin(a5,b5);
+    const vint8 d5 = umax(a5,b5);
+    const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); // same assumption: min in the even lanes, max in the odd lanes
+    return a6;
+  }
+
+  __forceinline vint8 usort_descending(const vint8& v) // same six-stage network as usort_ascending with umin/umax exchanged; descending per its name
+  {
+    const vint8 a0 = v;
+    const vint8 b0 = shuffle<1,0,3,2>(a0); // stage 1 partner: adjacent lanes swapped
+    const vint8 c0 = umax(a0,b0);
+    const vint8 d0 = umin(a0,b0);
+    const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); // NOTE(review): assumes bit k of the integer mask picks lane k from the first operand (here: max in lanes 0,3,4,7) - confirm against select/vboolf8
+    const vint8 b1 = shuffle<2,3,0,1>(a1); // stage 2 partner: lane pairs swapped inside each group of four
+    const vint8 c1 = umax(a1,b1);
+    const vint8 d1 = umin(a1,b1);
+    const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); // same assumption: max in lanes 0,1,6,7
+    const vint8 b2 = shuffle<1,0,3,2>(a2); // stage 3 partner: adjacent lanes swapped
+    const vint8 c2 = umax(a2,b2);
+    const vint8 d2 = umin(a2,b2);
+    const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); // same assumption: max in lanes 0,2,5,7
+    const vint8 b3 = shuffle4<1,0>(a3); // stage 4 partner: the two 128-bit halves swapped
+    const vint8 c3 = umax(a3,b3);
+    const vint8 d3 = umin(a3,b3);
+    const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); // same assumption: max in lanes 0..3, min in lanes 4..7
+    const vint8 b4 = shuffle<2,3,0,1>(a4); // stage 5 partner: lane pairs swapped inside each group of four
+    const vint8 c4 = umax(a4,b4);
+    const vint8 d4 = umin(a4,b4);
+    const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); // same assumption: max in lanes 0,1,4,5
+    const vint8 b5 = shuffle<1,0,3,2>(a5); // stage 6 partner: adjacent lanes swapped
+    const vint8 c5 = umax(a5,b5);
+    const vint8 d5 = umin(a5,b5);
+    const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); // same assumption: max in the even lanes, min in the odd lanes
+    return a6;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { // writes the eight lanes in index order as "<a0, a1, ..., a7>"
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; // returns the result of the chained << so calls can be chained further
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vint8_avx2.h
+++ b/thirdparty/embree/common/simd/vint8_avx2.h
@@ -0,0 +1,522 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+// While this header is processed, the short SIMD type names expand to their
+// *_impl counterparts (so `vint<8>` below names `vint_impl<8>`). The macros
+// are #undef'd again at the end of the file.
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX integer type */
+  /* Wraps one __m256i register that is also addressable as eight 32-bit ints
+     through the anonymous union below. Provides constructors, (masked)
+     loads/stores, gather/scatter and per-lane access. */
+  template<>
+  struct vint<8>
+  {
+    ALIGNED_STRUCT_(32);
+
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {             // data
+      __m256i v;        // whole register
+      int i[8];         // the same storage viewed lane by lane (used by operator[])
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    // Default construction leaves the register contents unset.
+    __forceinline vint() {}
+    __forceinline vint(const vint8& a) { v = a.v; }
+    __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; }
+
+    // Implicit conversions from and to the raw register type.
+    __forceinline vint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    // From 128-bit halves: the single-argument form places a in both halves,
+    // the two-argument forms place a in the low half and b in the high half.
+    __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
+    __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+    __forceinline vint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+ 
+    // Unaligned load of eight ints (performed through the float load intrinsic).
+    __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
+    // Broadcast / repeating-pattern / fully specified lane constructors.
+    // _mm256_set_epi32 takes the highest lane first, so the argument lists
+    // below are reversed to make the first parameter land in lane 0.
+    __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {}
+    __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {}
+    __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {}
+    __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {}
+
+    // Value conversion from packed floats to 32-bit ints (_mm256_cvtps_epi32),
+    // not a bit reinterpretation.
+    __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
+
+    // Construct from a lane mask: with AVX-512VL the k-mask is expanded to a
+    // vector, otherwise the mask's register bits are reinterpreted as ints.
+#if defined(__AVX512VL__)
+    __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {}
+#else
+    __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {}
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    // Tag-dispatched constant constructors; StepTy yields lanes 0..7 and
+    // ReverseStepTy yields lanes 7..0.
+    __forceinline vint(ZeroTy)        : v(_mm256_setzero_si256()) {}
+    __forceinline vint(OneTy)         : v(_mm256_set1_epi32(1)) {}
+    __forceinline vint(PosInfTy)      : v(_mm256_set1_epi32(pos_inf)) {}
+    __forceinline vint(NegInfTy)      : v(_mm256_set1_epi32(neg_inf)) {}
+    __forceinline vint(StepTy)        : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
+    __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {}
+    __forceinline vint(UndefinedTy)   : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    // Widening loads: read eight unsigned 8-bit / 16-bit values and zero-extend
+    // each to a 32-bit lane. The two unsigned char variants are identical; the
+    // unsigned short variants differ only in aligned vs. unaligned 128-bit load.
+    static __forceinline vint8 load(const unsigned char* ptr)  { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+    static __forceinline vint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+    static __forceinline vint8 load(const unsigned short* ptr)  { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); }
+    static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+
+    // Full-width loads; load uses the aligned intrinsic, loadu the unaligned one.
+    static __forceinline vint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); }
+    static __forceinline vint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); }
+
+    // Full-width stores; store uses the aligned intrinsic, storeu the unaligned one.
+    static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); }
+
+#if defined(__AVX512VL__)
+
+    // Compress the lanes selected by mask (AVX-512VL only).
+    static __forceinline vint8 compact(const vboolf8& mask, vint8 &v) {
+      return _mm256_mask_compress_epi32(v, mask, v);
+    }
+    static __forceinline vint8 compact(const vboolf8& mask, vint8 &a, const vint8& b) {
+      return _mm256_mask_compress_epi32(a, mask, b);
+    }
+
+    // Masked loads: a zero vector is passed as the source for unselected lanes.
+    static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); }
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); }
+
+    // Masked stores.
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); }
+#else
+    // Without AVX-512VL the aligned and unaligned masked variants share the
+    // same maskload/maskstore intrinsic.
+    static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+#endif
+    
+    // Streaming ("non-temporal") load and store via the _mm256_stream_* intrinsics.
+    static __forceinline vint8 load_nt(void* ptr) {
+      return _mm256_stream_load_si256((__m256i*)ptr);
+    }
+
+    static __forceinline void store_nt(void* ptr, const vint8& v) {
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    // Narrowing stores: write each lane to an 8-bit / 16-bit slot with a scalar
+    // loop. Note the parameter named i here hides the union member i.
+    static __forceinline void store(unsigned char* ptr, const vint8& i)
+    {
+      for (size_t j=0; j<8; j++)
+        ptr[j] = i[j];
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vint8& v) {
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    // Gather eight ints from ptr; index lanes are multiplied by `scale` (passed
+    // straight to the gather intrinsic) to form the offsets.
+    template<int scale = 4>
+    static __forceinline vint8 gather(const int *const ptr, const vint8& index) {
+      return _mm256_i32gather_epi32(ptr, index, scale);
+    }
+
+    // Masked gather; r (all zero) is handed to the intrinsic as the source
+    // vector for lanes the mask does not select.
+    template<int scale = 4>
+    static __forceinline vint8 gather(const vboolf8& mask, const int *const ptr, const vint8& index) {
+      vint8 r = zero;
+#if defined(__AVX512VL__)
+      return _mm256_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#else
+      return _mm256_mask_i32gather_epi32(r, ptr, index, mask, scale);
+#endif
+    }
+
+    // Scatter: writes v[k] to byte address ptr + scale*ofs[k]. Uses the scatter
+    // intrinsic with AVX-512VL, otherwise eight scalar stores in lane order.
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
+#else
+      *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    // Masked scatter: same addressing as above, but a lane is only written when
+    // its mask entry is set.
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
+#else
+      if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    // Replicates one 64-bit value into every 64-bit slot of the register.
+    static __forceinline vint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    // Per-lane access through the union's int array; index is only checked by assert.
+    __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; }
+    __forceinline       int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Turns an integer vector into a lane mask: with AVX-512VL via
+     _mm256_movepi32_mask, otherwise by reinterpreting the register bits. */
+  static __forceinline vboolf8 asBool(const vint8& a)
+  {
+#if defined(__AVX512VL__)
+    return _mm256_movepi32_mask(a);
+#else
+    return _mm256_castsi256_ps(a);
+#endif
+  }
+
+  /* Unary plus: hands back a copy of the operand. */
+  __forceinline vint8 operator +(const vint8& a)
+  {
+    return a;
+  }
+
+  /* Unary minus: computed per lane as 0 - a. */
+  __forceinline vint8 operator -(const vint8& a)
+  {
+    const __m256i zeros = _mm256_setzero_si256();
+    return _mm256_sub_epi32(zeros, a);
+  }
+
+  /* Per-lane absolute value of the 32-bit ints. */
+  __forceinline vint8 abs(const vint8& a)
+  {
+    return _mm256_abs_epi32(a);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* All operators below work per 32-bit lane. Overloads taking an int first
+     broadcast the scalar to all eight lanes and then forward to the
+     vector/vector form. */
+
+  /* addition */
+  __forceinline vint8 operator +(const vint8& a, const vint8& b) {
+    return _mm256_add_epi32(a, b);
+  }
+  __forceinline vint8 operator +(const vint8& a, int b) {
+    const vint8 rhs(b);
+    return a + rhs;
+  }
+  __forceinline vint8 operator +(int a, const vint8& b) {
+    const vint8 lhs(a);
+    return lhs + b;
+  }
+
+  /* subtraction */
+  __forceinline vint8 operator -(const vint8& a, const vint8& b) {
+    return _mm256_sub_epi32(a, b);
+  }
+  __forceinline vint8 operator -(const vint8& a, int b) {
+    const vint8 rhs(b);
+    return a - rhs;
+  }
+  __forceinline vint8 operator -(int a, const vint8& b) {
+    const vint8 lhs(a);
+    return lhs - b;
+  }
+
+  /* multiplication (low 32 bits of each product) */
+  __forceinline vint8 operator *(const vint8& a, const vint8& b) {
+    return _mm256_mullo_epi32(a, b);
+  }
+  __forceinline vint8 operator *(const vint8& a, int b) {
+    const vint8 rhs(b);
+    return a * rhs;
+  }
+  __forceinline vint8 operator *(int a, const vint8& b) {
+    const vint8 lhs(a);
+    return lhs * b;
+  }
+
+  /* bitwise and */
+  __forceinline vint8 operator &(const vint8& a, const vint8& b) {
+    return _mm256_and_si256(a, b);
+  }
+  __forceinline vint8 operator &(const vint8& a, int b) {
+    const vint8 rhs(b);
+    return a & rhs;
+  }
+  __forceinline vint8 operator &(int a, const vint8& b) {
+    const vint8 lhs(a);
+    return lhs & b;
+  }
+
+  /* bitwise or */
+  __forceinline vint8 operator |(const vint8& a, const vint8& b) {
+    return _mm256_or_si256(a, b);
+  }
+  __forceinline vint8 operator |(const vint8& a, int b) {
+    const vint8 rhs(b);
+    return a | rhs;
+  }
+  __forceinline vint8 operator |(int a, const vint8& b) {
+    const vint8 lhs(a);
+    return lhs | b;
+  }
+
+  /* bitwise xor */
+  __forceinline vint8 operator ^(const vint8& a, const vint8& b) {
+    return _mm256_xor_si256(a, b);
+  }
+  __forceinline vint8 operator ^(const vint8& a, int b) {
+    const vint8 rhs(b);
+    return a ^ rhs;
+  }
+  __forceinline vint8 operator ^(int a, const vint8& b) {
+    const vint8 lhs(a);
+    return lhs ^ b;
+  }
+
+  /* shifts by one count for all lanes; >> is the arithmetic right shift */
+  __forceinline vint8 operator <<(const vint8& a, int n) {
+    return _mm256_slli_epi32(a, n);
+  }
+  __forceinline vint8 operator >>(const vint8& a, int n) {
+    return _mm256_srai_epi32(a, n);
+  }
+
+  /* shifts with an individual count per lane; >> is the arithmetic right shift */
+  __forceinline vint8 operator <<(const vint8& a, const vint8& n) {
+    return _mm256_sllv_epi32(a, n);
+  }
+  __forceinline vint8 operator >>(const vint8& a, const vint8& n) {
+    return _mm256_srav_epi32(a, n);
+  }
+
+  /* named shifts, uniform count: left, arithmetic right, logical right */
+  __forceinline vint8 sll(const vint8& a, int b) {
+    return _mm256_slli_epi32(a, b);
+  }
+  __forceinline vint8 sra(const vint8& a, int b) {
+    return _mm256_srai_epi32(a, b);
+  }
+  __forceinline vint8 srl(const vint8& a, int b) {
+    return _mm256_srli_epi32(a, b);
+  }
+
+  /* named shifts, per-lane count: left, arithmetic right, logical right */
+  __forceinline vint8 sll(const vint8& a, const vint8& b) {
+    return _mm256_sllv_epi32(a, b);
+  }
+  __forceinline vint8 sra(const vint8& a, const vint8& b) {
+    return _mm256_srav_epi32(a, b);
+  }
+  __forceinline vint8 srl(const vint8& a, const vint8& b) {
+    return _mm256_srlv_epi32(a, b);
+  }
+  
+  /* signed minimum */
+  __forceinline vint8 min(const vint8& a, const vint8& b) {
+    return _mm256_min_epi32(a, b);
+  }
+  __forceinline vint8 min(const vint8& a, int b) {
+    const vint8 rhs(b);
+    return min(a,rhs);
+  }
+  __forceinline vint8 min(int a, const vint8& b) {
+    const vint8 lhs(a);
+    return min(lhs,b);
+  }
+
+  /* signed maximum */
+  __forceinline vint8 max(const vint8& a, const vint8& b) {
+    return _mm256_max_epi32(a, b);
+  }
+  __forceinline vint8 max(const vint8& a, int b) {
+    const vint8 rhs(b);
+    return max(a,rhs);
+  }
+  __forceinline vint8 max(int a, const vint8& b) {
+    const vint8 lhs(a);
+    return max(lhs,b);
+  }
+
+  /* minimum / maximum with the lanes interpreted as unsigned 32-bit values */
+  __forceinline vint8 umin(const vint8& a, const vint8& b) {
+    return _mm256_min_epu32(a, b);
+  }
+  __forceinline vint8 umax(const vint8& a, const vint8& b) {
+    return _mm256_max_epu32(a, b);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Compound assignments: each one evaluates the matching binary operator,
+     stores the result back into a and returns a by reference. */
+  __forceinline vint8& operator +=(vint8& a, const vint8& b) { a = a + b; return a; }
+  __forceinline vint8& operator +=(vint8& a, int          b) { a = a + b; return a; }
+  
+  __forceinline vint8& operator -=(vint8& a, const vint8& b) { a = a - b; return a; }
+  __forceinline vint8& operator -=(vint8& a, int          b) { a = a - b; return a; }
+  
+  __forceinline vint8& operator *=(vint8& a, const vint8& b) { a = a * b; return a; }
+  __forceinline vint8& operator *=(vint8& a, int          b) { a = a * b; return a; }
+  
+  __forceinline vint8& operator &=(vint8& a, const vint8& b) { a = a & b; return a; }
+  __forceinline vint8& operator &=(vint8& a, int          b) { a = a & b; return a; }
+  
+  __forceinline vint8& operator |=(vint8& a, const vint8& b) { a = a | b; return a; }
+  __forceinline vint8& operator |=(vint8& a, int          b) { a = a | b; return a; }
+  
+  __forceinline vint8& operator <<=(vint8& a, const int b) { a = a << b; return a; }
+  __forceinline vint8& operator >>=(vint8& a, const int b) { a = a >> b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Per-lane 32-bit integer comparisons producing a vboolf8 mask. With
+     AVX-512VL every relation maps to one mask-compare intrinsic; otherwise
+     only ==, < and > use an intrinsic and the remaining three are formed by
+     negating those. */
+  static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_EQ);
+#else
+    return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b));
+#endif
+  }
+
+  static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_NE);
+#else
+    return !(a == b);
+#endif
+  }
+
+  static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LT);
+#else
+    return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b, a));
+#endif
+  }
+
+  static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GE);
+#else
+    return !(a <  b);
+#endif
+  }
+
+  static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GT);
+#else
+    return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, b));
+#endif
+  }
+
+  static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LE);
+#else
+    return !(a >  b);
+#endif
+  }
+
+  /* Per-lane choice driven by a runtime mask: t where m is set, f elsewhere. */
+  static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) {
+#if defined(__AVX512VL__)
+    return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t);
+#else
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+#endif
+  }
+
+  /* Per-lane choice driven by a compile-time bit mask: bit k set picks lane k
+     from t, otherwise from f. */
+  template<int mask>
+  __forceinline vint8 select(const vint8& t, const vint8& f)
+  {
+    return _mm256_blend_epi32(f, t, mask);
+  }
+
+  /* Comparisons against a scalar: the int is broadcast to all lanes and the
+     vector/vector operator is applied. */
+  __forceinline vboolf8 operator ==(const vint8& a, int b) {
+    const vint8 rhs(b);
+    return a == rhs;
+  }
+  __forceinline vboolf8 operator ==(int a, const vint8& b) {
+    const vint8 lhs(a);
+    return lhs == b;
+  }
+
+  __forceinline vboolf8 operator !=(const vint8& a, int b) {
+    const vint8 rhs(b);
+    return a != rhs;
+  }
+  __forceinline vboolf8 operator !=(int a, const vint8& b) {
+    const vint8 lhs(a);
+    return lhs != b;
+  }
+
+  __forceinline vboolf8 operator < (const vint8& a, int b) {
+    const vint8 rhs(b);
+    return a <  rhs;
+  }
+  __forceinline vboolf8 operator < (int a, const vint8& b) {
+    const vint8 lhs(a);
+    return lhs <  b;
+  }
+
+  __forceinline vboolf8 operator >=(const vint8& a, int b) {
+    const vint8 rhs(b);
+    return a >= rhs;
+  }
+  __forceinline vboolf8 operator >=(int a, const vint8& b) {
+    const vint8 lhs(a);
+    return lhs >= b;
+  }
+
+  __forceinline vboolf8 operator > (const vint8& a, int b) {
+    const vint8 rhs(b);
+    return a >  rhs;
+  }
+  __forceinline vboolf8 operator > (int a, const vint8& b) {
+    const vint8 lhs(a);
+    return lhs >  b;
+  }
+
+  __forceinline vboolf8 operator <=(const vint8& a, int b) {
+    const vint8 rhs(b);
+    return a <= rhs;
+  }
+  __forceinline vboolf8 operator <=(int a, const vint8& b) {
+    const vint8 lhs(a);
+    return lhs <= b;
+  }
+
+  /* Function-style names for the six vector/vector comparisons. */
+  __forceinline vboolf8 eq(const vint8& a, const vint8& b) {
+    return a == b;
+  }
+  __forceinline vboolf8 ne(const vint8& a, const vint8& b) {
+    return a != b;
+  }
+  __forceinline vboolf8 lt(const vint8& a, const vint8& b) {
+    return a <  b;
+  }
+  __forceinline vboolf8 ge(const vint8& a, const vint8& b) {
+    return a >= b;
+  }
+  __forceinline vboolf8 gt(const vint8& a, const vint8& b) {
+    return a >  b;
+  }
+  __forceinline vboolf8 le(const vint8& a, const vint8& b) {
+    return a <= b;
+  }
+
+  /* Masked comparisons: the result is restricted to the lanes enabled in
+     mask. AVX-512VL uses the masked compare intrinsic; otherwise the plain
+     comparison result is AND-ed with mask. */
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ);
+#else
+    return mask & (a == b);
+#endif
+  }
+
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE);
+#else
+    return mask & (a != b);
+#endif
+  }
+
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT);
+#else
+    return mask & (a <  b);
+#endif
+  }
+
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE);
+#else
+    return mask & (a >= b);
+#endif
+  }
+
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT);
+#else
+    return mask & (a >  b);
+#endif
+  }
+
+  static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE);
+#else
+    return mask & (a <= b);
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Interleave 32-bit elements of a and b (low / high variants of the unpack intrinsic).
+  __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a, b); }
+  __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_unpackhi_epi32(a, b); }
+
+  // Replicates element i using _mm256_permute_ps with the selector (i,i,i,i).
+  // Per Intel's documentation that intrinsic permutes within each 128-bit
+  // half, so each half is filled with its own element i.
+  template<int i>
+  __forceinline vint8 shuffle(const vint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  // Rearranges whole 128-bit halves: i0 selects the source for the low half
+  // of the result, i1 the source for the high half.
+  template<int i0, int i1>
+  __forceinline vint8 shuffle4(const vint8& v) {
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  // Same as above, but the selectors pick from the halves of both a and b.
+  template<int i0, int i1>
+  __forceinline vint8 shuffle4(const vint8& a, const vint8& b) {
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  // General 4-element permutation (i0..i3 name the source element for result
+  // positions 0..3), applied via _mm256_permute_ps.
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint8 shuffle(const vint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  // Two-source variant built on _mm256_shuffle_ps.
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint8 shuffle(const vint8& a, const vint8& b) {
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  // Specializations of the single-source shuffle for three patterns that have
+  // dedicated duplicate-move intrinsics. They must stay after the primary template.
+  template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+  // Loads one int from ptr and replicates it into all eight lanes.
+  __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); }
+
+  // Insert / extract a 128-bit half; i selects the half. Extracting half 0 is
+  // specialized to a plain cast.
+  template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); }
+  template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); }
+  template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); }
+
+  // Returns lane 0 as a scalar.
+  __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
+
+  // The functions below are compiled out on aarch64 builds.
+#if !defined(__aarch64__)
+  // Permutation across all eight lanes with runtime indices.
+  __forceinline vint8 permute(const vint8& v, const __m256i& index) {
+    return _mm256_permutevar8x32_epi32(v, index);
+  }
+
+  // Runtime-index shuffle via _mm256_permutevar_ps.
+  __forceinline vint8 shuffle(const vint8& v, const __m256i& index) {
+    return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index));
+  }
+
+  // Shifts the concatenation of a and b right by i 32-bit elements.
+  // NOTE(review): per the Intel intrinsics guide _mm256_alignr_epi32 shifts
+  // across the full 256-bit pair, whereas _mm256_alignr_epi8 shifts each
+  // 128-bit lane separately, so the two branches may not return the same
+  // lanes -- confirm against the callers before relying on the fallback.
+  template<int i>
+  static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_alignr_epi32(a, b, i);    
+#else
+    return _mm256_alignr_epi8(a, b, 4*i);
+#endif
+  }  
+
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Vector reductions are built in three steps: combine neighbouring lanes
+     (..2), then pairs of lanes (..4), then the two 128-bit halves. The
+     final form is what the scalar reduce_* functions read lane 0 from. */
+  __forceinline vint8 vreduce_min2(const vint8& v) {
+    const vint8 swapped = shuffle<1,0,3,2>(v);
+    return min(v,swapped);
+  }
+  __forceinline vint8 vreduce_min4(const vint8& v) {
+    const vint8 pairs = vreduce_min2(v);
+    return min(pairs,shuffle<2,3,0,1>(pairs));
+  }
+  __forceinline vint8 vreduce_min (const vint8& v) {
+    const vint8 quads = vreduce_min4(v);
+    return min(quads,shuffle4<1,0>(quads));
+  }
+
+  __forceinline vint8 vreduce_max2(const vint8& v) {
+    const vint8 swapped = shuffle<1,0,3,2>(v);
+    return max(v,swapped);
+  }
+  __forceinline vint8 vreduce_max4(const vint8& v) {
+    const vint8 pairs = vreduce_max2(v);
+    return max(pairs,shuffle<2,3,0,1>(pairs));
+  }
+  __forceinline vint8 vreduce_max (const vint8& v) {
+    const vint8 quads = vreduce_max4(v);
+    return max(quads,shuffle4<1,0>(quads));
+  }
+
+  __forceinline vint8 vreduce_add2(const vint8& v) {
+    const vint8 swapped = shuffle<1,0,3,2>(v);
+    return v + swapped;
+  }
+  __forceinline vint8 vreduce_add4(const vint8& v) {
+    const vint8 pairs = vreduce_add2(v);
+    return pairs + shuffle<2,3,0,1>(pairs);
+  }
+  __forceinline vint8 vreduce_add (const vint8& v) {
+    const vint8 quads = vreduce_add4(v);
+    return quads + shuffle4<1,0>(quads);
+  }
+
+  /* Scalar results: lane 0 of the corresponding vector reduction. */
+  __forceinline int reduce_min(const vint8& v) {
+    return toScalar(vreduce_min(v));
+  }
+  __forceinline int reduce_max(const vint8& v) {
+    return toScalar(vreduce_max(v));
+  }
+  __forceinline int reduce_add(const vint8& v) {
+    return toScalar(vreduce_add(v));
+  }
+
+  /* Index of the first lane (lowest set bit of the match mask) that equals
+     the minimum / maximum. */
+  __forceinline size_t select_min(const vint8& v) {
+    const vboolf8 hits = (v == vreduce_min(v));
+    return bsf(movemask(hits));
+  }
+  __forceinline size_t select_max(const vint8& v) {
+    const vboolf8 hits = (v == vreduce_max(v));
+    return bsf(movemask(hits));
+  }
+
+  /* Same, restricted to lanes enabled in valid: disabled lanes are replaced
+     by pos_inf / neg_inf before reducing and are masked out of the match. */
+  __forceinline size_t select_min(const vboolf8& valid, const vint8& v) {
+    const vint8 masked = select(valid,v,vint8(pos_inf));
+    const vboolf8 hits = valid & (masked == vreduce_min(masked));
+    return bsf(movemask(hits));
+  }
+  __forceinline size_t select_max(const vboolf8& valid, const vint8& v) {
+    const vint8 masked = select(valid,v,vint8(neg_inf));
+    const vboolf8 hits = valid & (masked == vreduce_max(masked));
+    return bsf(movemask(hits));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting networks
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Sorting network over the eight lanes, compared as unsigned 32-bit values.
+     Each of the six stages pairs every lane with a partner (shuffle) and uses
+     a compile-time blend mask to keep either the smaller (mask bit set) or the
+     larger value of the pair in that lane. Per the function name the result
+     is meant to be in ascending order. */
+  __forceinline vint8 usort_ascending(const vint8& v)
+  {
+    vint8 cur = v;
+    vint8 partner = shuffle<1,0,3,2>(cur);
+    cur = select<0x99 /* 0b10011001 */>(umin(cur,partner),umax(cur,partner));
+
+    partner = shuffle<2,3,0,1>(cur);
+    cur = select<0xc3 /* 0b11000011 */>(umin(cur,partner),umax(cur,partner));
+
+    partner = shuffle<1,0,3,2>(cur);
+    cur = select<0xa5 /* 0b10100101 */>(umin(cur,partner),umax(cur,partner));
+
+    partner = shuffle4<1,0>(cur);
+    cur = select<0xf /* 0b00001111 */>(umin(cur,partner),umax(cur,partner));
+
+    partner = shuffle<2,3,0,1>(cur);
+    cur = select<0x33 /* 0b00110011 */>(umin(cur,partner),umax(cur,partner));
+
+    partner = shuffle<1,0,3,2>(cur);
+    cur = select<0x55 /* 0b01010101 */>(umin(cur,partner),umax(cur,partner));
+    return cur;
+  }
+
+  /* Mirror image of the ascending network: same stages, partners and blend
+     masks, but a set mask bit keeps the larger (umax) value of each pair and
+     a clear bit the smaller one. Per the function name the result is meant to
+     be in descending unsigned order. */
+  __forceinline vint8 usort_descending(const vint8& v)
+  {
+    vint8 cur = v;
+    vint8 partner = shuffle<1,0,3,2>(cur);
+    cur = select<0x99 /* 0b10011001 */>(umax(cur,partner),umin(cur,partner));
+
+    partner = shuffle<2,3,0,1>(cur);
+    cur = select<0xc3 /* 0b11000011 */>(umax(cur,partner),umin(cur,partner));
+
+    partner = shuffle<1,0,3,2>(cur);
+    cur = select<0xa5 /* 0b10100101 */>(umax(cur,partner),umin(cur,partner));
+
+    partner = shuffle4<1,0>(cur);
+    cur = select<0xf /* 0b00001111 */>(umax(cur,partner),umin(cur,partner));
+
+    partner = shuffle<2,3,0,1>(cur);
+    cur = select<0x33 /* 0b00110011 */>(umax(cur,partner),umin(cur,partner));
+
+    partner = shuffle<1,0,3,2>(cur);
+    cur = select<0x55 /* 0b01010101 */>(umax(cur,partner),umin(cur,partner));
+    return cur;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Writes the eight lanes as "<a0, a1, ..., a7>" and returns the stream. */
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a)
+  {
+    cout << "<" << a[0];
+    for (size_t lane=1; lane<8; lane++)
+      cout << ", " << a[lane];
+    return cout << ">";
+  }
+}
+
+// Undefine the short SIMD type-name macros at the end of this header so they
+// do not remain defined for code that follows it.
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vllong4_avx2.h
+++ b/thirdparty/embree/common/simd/vllong4_avx2.h
@@ -0,0 +1,352 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{ 
+  /* 4-wide AVX2 64-bit long long type */
+  template<>
+  struct vllong<4>
+  {
+    ALIGNED_STRUCT_(32);
+
+    typedef vboold4 Bool; // boolean vector type used for masks of this vector
+
+    enum  { size = 4 }; // lane count
+    union {             // storage: one 256-bit register aliased as four 64-bit lanes
+      __m256i v;
+      long long i[4];
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vllong() {} // leaves the lanes uninitialized
+    __forceinline vllong(const vllong4& t) : v(t.v) {}
+    __forceinline vllong4& operator =(const vllong4& f) { v = f.v; return *this; }
+
+    __forceinline vllong(const __m256i& t) : v(t) {}
+    __forceinline operator __m256i() const { return v; }
+    __forceinline operator __m256d() const { return _mm256_castsi256_pd(v); } // bit-cast, no numeric conversion
+
+    /* broadcast one value into all four lanes */
+    __forceinline vllong(long long i)
+      : v(_mm256_set1_epi64x(i)) {
+    }
+    /* lane-wise construction: a fills lane 0, d fills lane 3 */
+    __forceinline vllong(long long a, long long b, long long c, long long d)
+      : v(_mm256_setr_epi64x(a,b,c,d)) {
+    }
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vllong(ZeroTy) : v(_mm256_setzero_si256()) {}
+    __forceinline vllong(OneTy)  : v(_mm256_set1_epi64x(1)) {}
+    __forceinline vllong(StepTy) : v(_mm256_setr_epi64x(0,1,2,3)) {}        // lane k holds k
+    __forceinline vllong(ReverseStepTy) : v(_mm256_setr_epi64x(3,2,1,0)) {} // lane k holds 3-k
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+    /* load/store use the aligned intrinsics, loadu/storeu the unaligned ones, store_nt the streaming one */
+    static __forceinline void store_nt(void* __restrict__ ptr, const vllong4& a) {
+      _mm256_stream_ps(static_cast<float*>(ptr), _mm256_castsi256_ps(a.v));
+    }
+
+    static __forceinline vllong4 loadu(const void* addr)
+    {
+      return _mm256_loadu_si256(static_cast<const __m256i*>(addr));
+    }
+
+    static __forceinline vllong4 load(const vllong4* addr) {
+      return _mm256_load_si256(reinterpret_cast<const __m256i*>(addr));
+    }
+
+    static __forceinline vllong4 load(const long long* addr) {
+      return _mm256_load_si256(reinterpret_cast<const __m256i*>(addr));
+    }
+
+    static __forceinline void store(void* ptr, const vllong4& v) {
+      _mm256_store_si256(static_cast<__m256i*>(ptr), v.v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vllong4& v) {
+      _mm256_storeu_si256(static_cast<__m256i*>(ptr), v.v);
+    }
+    /* masked stores: only lanes selected by mask are written */
+    static __forceinline void storeu(const vboold4& mask, long long* ptr, const vllong4& f) {
+#if defined(__AVX512VL__)
+      _mm256_mask_storeu_epi64(ptr, mask, f.v);
+#else
+      _mm256_maskstore_pd(reinterpret_cast<double*>(ptr), mask, _mm256_castsi256_pd(f.v));
+#endif
+    }
+
+    static __forceinline void store(const vboold4& mask, void* ptr, const vllong4& f) {
+#if defined(__AVX512VL__)
+      _mm256_mask_store_epi64(ptr, mask, f.v);
+#else
+      _mm256_maskstore_pd(static_cast<double*>(ptr), mask, _mm256_castsi256_pd(f.v));
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    /* per-lane access through the array view of the union; index is only checked by assert */
+    __forceinline       long long& operator [](size_t index)       { assert(index < 4); return i[index]; }
+    __forceinline const long long& operator [](size_t index) const { assert(index < 4); return i[index]; }
+
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vllong4 select(const vboold4& m, const vllong4& t, const vllong4& f) { // per lane: t where m is set, otherwise f
+  #if !defined(__AVX512VL__)
+    return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m));
+  #else
+    return _mm256_mask_blend_epi64(m, f, t);
+  #endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if !defined(__AVX512VL__)
+  __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a); } // bit-cast of the 256-bit value
+#else
+  __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a); } // one mask bit per lane, taken from the lane's top bit
+#endif
+
+  __forceinline vllong4 operator +(const vllong4& a) { return a; }
+  __forceinline vllong4 operator -(const vllong4& a) { const __m256i zero = _mm256_setzero_si256(); return _mm256_sub_epi64(zero, a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a, b); }
+  __forceinline vllong4 operator +(const vllong4& a, long long      b) { return _mm256_add_epi64(a, _mm256_set1_epi64x(b)); }
+  __forceinline vllong4 operator +(long long      a, const vllong4& b) { return _mm256_add_epi64(_mm256_set1_epi64x(a), b); }
+
+  __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a, b); }
+  __forceinline vllong4 operator -(const vllong4& a, long long      b) { return _mm256_sub_epi64(a, _mm256_set1_epi64x(b)); }
+  __forceinline vllong4 operator -(long long      a, const vllong4& b) { return _mm256_sub_epi64(_mm256_set1_epi64x(a), b); }
+
+  /* multiplies only the low 32 bits of each 64-bit lane (_mm256_mul_epi32); upper halves of the inputs are ignored */
+  __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a, b); }
+  __forceinline vllong4 operator *(const vllong4& a, long long      b) { return _mm256_mul_epi32(a, _mm256_set1_epi64x(b)); }
+  __forceinline vllong4 operator *(long long      a, const vllong4& b) { return _mm256_mul_epi32(_mm256_set1_epi64x(a), b); }
+
+  __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a, b); }
+  __forceinline vllong4 operator &(const vllong4& a, long long      b) { return _mm256_and_si256(a, _mm256_set1_epi64x(b)); }
+  __forceinline vllong4 operator &(long long      a, const vllong4& b) { return _mm256_and_si256(_mm256_set1_epi64x(a), b); }
+
+  __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a, b); }
+  __forceinline vllong4 operator |(const vllong4& a, long long      b) { return _mm256_or_si256(a, _mm256_set1_epi64x(b)); }
+  __forceinline vllong4 operator |(long long      a, const vllong4& b) { return _mm256_or_si256(_mm256_set1_epi64x(a), b); }
+
+  __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a, b); }
+  __forceinline vllong4 operator ^(const vllong4& a, long long      b) { return _mm256_xor_si256(a, _mm256_set1_epi64x(b)); }
+  __forceinline vllong4 operator ^(long long      a, const vllong4& b) { return _mm256_xor_si256(_mm256_set1_epi64x(a), b); }
+
+  __forceinline vllong4 operator <<(const vllong4& a, long long n) { return _mm256_slli_epi64(a, static_cast<int>(n)); }
+  //__forceinline vllong4 operator >>(const vllong4& a, long long n) { return _mm256_srai_epi64(a, n); }
+  /* per-lane shift counts */
+  __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a, n); }
+  //__forceinline vllong4 operator >>(const vllong4& a, const vllong4& n) { return _mm256_srav_epi64(a, n); }
+  //__forceinline vllong4 sra(const vllong4& a, long long b) { return _mm256_srai_epi64(a, b); }
+  /* logical (zero-filling) right shift */
+  __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a, static_cast<int>(b)); }
+  
+  //__forceinline vllong4 min(const vllong4& a, const vllong4& b) { return _mm256_min_epi64(a, b); }
+  //__forceinline vllong4 min(const vllong4& a, long long      b) { return min(a,vllong4(b)); }
+  //__forceinline vllong4 min(long long      a, const vllong4& b) { return min(vllong4(a),b); }
+
+  //__forceinline vllong4 max(const vllong4& a, const vllong4& b) { return _mm256_max_epi64(a, b); }
+  //__forceinline vllong4 max(const vllong4& a, long long      b) { return max(a,vllong4(b)); }
+  //__forceinline vllong4 max(long long      a, const vllong4& b) { return max(vllong4(a),b); }
+
+#if !defined(__AVX512VL__) // both variants: lanes selected by m get a&b (or a|b), the remaining lanes keep c
+  __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a & b, c); }
+  __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a | b, c); }
+#else
+  __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_and_epi64(c,m,a,b); }
+  __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c,m,a,b); }
+#endif
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong4& operator +=(vllong4& a, const vllong4& b) { a = a + b; return a; }
+  __forceinline vllong4& operator +=(vllong4& a, long long      b) { a = a + b; return a; }
+
+  __forceinline vllong4& operator -=(vllong4& a, const vllong4& b) { a = a - b; return a; }
+  __forceinline vllong4& operator -=(vllong4& a, long long      b) { a = a - b; return a; }
+
+  __forceinline vllong4& operator *=(vllong4& a, const vllong4& b) { a = a * b; return a; }
+  __forceinline vllong4& operator *=(vllong4& a, long long      b) { a = a * b; return a; }
+
+  __forceinline vllong4& operator &=(vllong4& a, const vllong4& b) { a = a & b; return a; }
+  __forceinline vllong4& operator &=(vllong4& a, long long      b) { a = a & b; return a; }
+
+  __forceinline vllong4& operator |=(vllong4& a, const vllong4& b) { a = a | b; return a; }
+  __forceinline vllong4& operator |=(vllong4& a, long long      b) { a = a | b; return a; }
+
+  __forceinline vllong4& operator <<=(vllong4& a, long long      b) { a = a << b; return a; }
+  //__forceinline vllong4& operator >>=(vllong4& a, long long      b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if !defined(__AVX512VL__) // only equal and greater-than are compared directly here; the rest are derived from them
+  __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a,b); }
+  __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return !(a == b); }
+  __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a,b); }
+  __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b,a); }
+  __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return !(a < b); }
+  __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return !(a > b); }
+#else
+  __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+#endif
+
+  __forceinline vboold4 operator ==(const vllong4& a, long long      b) { const vllong4 rhs(b); return a == rhs; } // scalar is broadcast first
+  __forceinline vboold4 operator ==(long long      a, const vllong4& b) { const vllong4 lhs(a); return lhs == b; }
+
+  __forceinline vboold4 operator !=(const vllong4& a, long long      b) { const vllong4 rhs(b); return a != rhs; }
+  __forceinline vboold4 operator !=(long long      a, const vllong4& b) { const vllong4 lhs(a); return lhs != b; }
+
+  __forceinline vboold4 operator > (const vllong4& a, long long      b) { const vllong4 rhs(b); return a >  rhs; }
+  __forceinline vboold4 operator > (long long      a, const vllong4& b) { const vllong4 lhs(a); return lhs >  b; }
+
+  __forceinline vboold4 operator < (const vllong4& a, long long      b) { const vllong4 rhs(b); return a <  rhs; }
+  __forceinline vboold4 operator < (long long      a, const vllong4& b) { const vllong4 lhs(a); return lhs <  b; }
+
+  __forceinline vboold4 operator >=(const vllong4& a, long long      b) { const vllong4 rhs(b); return a >= rhs; }
+  __forceinline vboold4 operator >=(long long      a, const vllong4& b) { const vllong4 lhs(a); return lhs >= b; }
+
+  __forceinline vboold4 operator <=(const vllong4& a, long long      b) { const vllong4 rhs(b); return a <= rhs; }
+  __forceinline vboold4 operator <=(long long      a, const vllong4& b) { const vllong4 lhs(a); return lhs <= b; }
+
+  __forceinline vboold4 eq(const vllong4& a, const vllong4& b) { return operator ==(a, b); } // named aliases of the comparison operators
+  __forceinline vboold4 ne(const vllong4& a, const vllong4& b) { return operator !=(a, b); }
+  __forceinline vboold4 lt(const vllong4& a, const vllong4& b) { return operator < (a, b); }
+  __forceinline vboold4 ge(const vllong4& a, const vllong4& b) { return operator >=(a, b); }
+  __forceinline vboold4 gt(const vllong4& a, const vllong4& b) { return operator > (a, b); }
+  __forceinline vboold4 le(const vllong4& a, const vllong4& b) { return operator <=(a, b); }
+
+#if !defined(__AVX512VL__) // masked compares: the comparison result is combined with mask
+  __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a == b); }
+  __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a != b); }
+  __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <  b); }
+  __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >= b); }
+  __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >  b); }
+  __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <= b); }
+#else
+  __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LE); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i0, int i1>
+  __forceinline vllong4 shuffle(const vllong4& v) { // same (i0,i1) selection applied inside each 128-bit half
+    return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v), i0 | (i1 << 1) | (i0 << 2) | (i1 << 3)));
+  }
+
+  template<int i>
+  __forceinline vllong4 shuffle(const vllong4& v) { // single-index form: forwards to shuffle<i, i>
+    return shuffle<i, i>(v);
+  }
+
+  template<int i0, int i1>
+  __forceinline vllong4 shuffle2(const vllong4& v) { // selects whole 128-bit halves of v
+    return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(v), i0 | (i1 << 4)));
+  }
+
+  __forceinline long long toScalar(const vllong4& v) { // extracts lane 0
+    const __m128i low = _mm256_castsi256_si128(v); return _mm_cvtsi128_si64(low);
+  }
+
+#if defined(__AVX512VL__) // both permute helpers below exist only in AVX-512VL builds
+  __forceinline vllong4 permute(const vllong4& a, const __m256i& index) { // rearranges the lanes of a according to index
+    // workaround for GCC 7.x: use the two-source permute with a passed as both sources instead of the single-source form
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+    return _mm256_permutex2var_epi64(a,index,a);
+#else
+    return _mm256_permutexvar_epi64(index,a);
+#endif
+  }
+  // two-source permute; note the wrapper takes index first while the intrinsic takes it as the middle argument
+  __forceinline vllong4 permutex2var(const vllong4& index, const vllong4& a, const vllong4& b) {
+    return _mm256_permutex2var_epi64(a,index,b);
+  }
+
+#endif
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+  
+
+  __forceinline vllong4 vreduce_and2(const vllong4& x) { const vllong4 swapped = shuffle<1,0>(x); return x & swapped; }
+  __forceinline vllong4 vreduce_and (const vllong4& y) { const vllong4 pairs = vreduce_and2(y); return pairs & shuffle2<1,0>(pairs); }
+
+  __forceinline vllong4 vreduce_or2(const vllong4& x) { const vllong4 swapped = shuffle<1,0>(x); return x | swapped; }
+  __forceinline vllong4 vreduce_or (const vllong4& y) { const vllong4 pairs = vreduce_or2(y); return pairs | shuffle2<1,0>(pairs); }
+
+  __forceinline vllong4 vreduce_add2(const vllong4& x) { const vllong4 swapped = shuffle<1,0>(x); return x + swapped; }
+  __forceinline vllong4 vreduce_add (const vllong4& y) { const vllong4 pairs = vreduce_add2(y); return pairs + shuffle2<1,0>(pairs); }
+  /* scalar results: lane 0 of the corresponding vector reduction */
+  __forceinline long long reduce_add(const vllong4& a) { const vllong4 total = vreduce_add(a); return toScalar(total); }
+  __forceinline long long reduce_or (const vllong4& a) { const vllong4 total = vreduce_or(a);  return toScalar(total); }
+  __forceinline long long reduce_and(const vllong4& a) { const vllong4 total = vreduce_and(a); return toScalar(total); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vllong4& v)
+  {
+    cout << "<" << v[0] << ", " << v[1];  // lanes 0 and 1
+    cout << ", " << v[2] << ", " << v[3]; // lanes 2 and 3
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vllong8_avx512.h
+++ b/thirdparty/embree/common/simd/vllong8_avx512.h
@@ -0,0 +1,358 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{ 
+  /* 8-wide AVX-512 64-bit long long type */
+  template<>
+  struct vllong<8>
+  {
+    ALIGNED_STRUCT_(64);
+
+    typedef vboold8 Bool; // boolean vector type used for masks of this vector
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {             // data: one 512-bit register aliased as eight 64-bit lanes
+      __m512i v; 
+      long long i[8];
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vllong() {} // leaves the lanes uninitialized
+    __forceinline vllong(const vllong8& t) { v = t.v; }
+    __forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; }
+
+    __forceinline vllong(const __m512i& t) { v = t; }
+    __forceinline operator __m512i() const { return v; }
+    __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } // lower 256 bits only
+    /* broadcast one value into all eight lanes */
+    __forceinline vllong(long long i) {
+      v = _mm512_set1_epi64(i);
+    }
+    /* the four values a,b,c,d are repeated: lanes 0-3 and lanes 4-7 both hold a,b,c,d */
+    __forceinline vllong(long long a, long long b, long long c, long long d) {
+      v = _mm512_set4_epi64(d,c,b,a);      
+    }
+    /* lane-wise construction: a0 fills lane 0, a7 fills lane 7 */
+    __forceinline vllong(long long a0, long long a1, long long a2, long long a3,
+                         long long a4, long long a5, long long a6, long long a7)
+    {
+      v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0);
+    }
+    /* duplicates the 4-wide vector into both 256-bit halves */
+    __forceinline vllong(const vllong<4>& i) {
+      v = _mm512_broadcast_i64x4(i);
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {}
+    __forceinline vllong(OneTy)  : v(_mm512_set1_epi64(1)) {}
+    __forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {}         // lane k holds k
+    __forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {} // lane k holds 7-k
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+    /* load/store use the aligned intrinsics, loadu/storeu the unaligned ones, store_nt the streaming one */
+    static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) {
+      _mm512_stream_si512((__m512i*)ptr,a);
+    }
+
+    static __forceinline vllong8 loadu(const void* addr) {
+      return _mm512_loadu_si512(addr);
+    }
+
+    static __forceinline vllong8 load(const vllong8* addr) {
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vllong8 load(const long long* addr) {
+      return _mm512_load_si512(addr);
+    }
+    /* zero-extends the 8 bytes at ptr to eight 64-bit lanes; the 8-byte unaligned load avoids touching 16 bytes or assuming 16-byte alignment */
+    static __forceinline vllong8 load(const unsigned char* ptr) {
+      return _mm512_cvtepu8_epi64(_mm_loadl_epi64((const __m128i*)ptr));
+    }
+
+    static __forceinline void store(void* ptr, const vllong8& v) {
+      _mm512_store_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vllong8& v) {
+      _mm512_storeu_si512(ptr,v);
+    }
+    /* masked stores: only lanes selected by mask are written */
+    static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) {
+      _mm512_mask_storeu_epi64(ptr,mask,f);
+    }
+
+    static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) {
+      _mm512_mask_store_epi64(addr,mask,v2);
+    }
+    /* packs the lanes of v selected by mask into the low lanes; remaining lanes keep v's values */
+    static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) {
+      return _mm512_mask_compress_epi64(v,mask,v);
+    }
+    /* packs the lanes of b selected by mask into the low lanes; remaining lanes are taken from a */
+    static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) {
+      return _mm512_mask_compress_epi64(a,mask,b);
+    }
+    /* spreads consecutive low lanes of a into the lanes selected by mask; unselected lanes are taken from b */
+    static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) {
+      return _mm512_mask_expand_epi64(b,mask,a);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    /* per-lane access through the array view of the union; index is only checked by assert */
+    __forceinline       long long& operator [](size_t index)       { assert(index < 8); return i[index]; }
+    __forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; }
+
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); } // one mask bit per lane, taken from the lane's top bit
+
+  __forceinline vllong8 operator +(const vllong8& a) { return a; }
+  __forceinline vllong8 operator -(const vllong8& a) { const __m512i zero = _mm512_setzero_si512(); return _mm512_sub_epi64(zero, a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); }
+  __forceinline vllong8 operator +(const vllong8& a, long long      b) { return _mm512_add_epi64(a, _mm512_set1_epi64(b)); }
+  __forceinline vllong8 operator +(long long      a, const vllong8& b) { return _mm512_add_epi64(_mm512_set1_epi64(a), b); }
+
+  __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); }
+  __forceinline vllong8 operator -(const vllong8& a, long long      b) { return _mm512_sub_epi64(a, _mm512_set1_epi64(b)); }
+  __forceinline vllong8 operator -(long long      a, const vllong8& b) { return _mm512_sub_epi64(_mm512_set1_epi64(a), b); }
+
+  __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); }
+  __forceinline vllong8 operator *(const vllong8& a, long long      b) { return _mm512_mullo_epi64(a, _mm512_set1_epi64(b)); }
+  __forceinline vllong8 operator *(long long      a, const vllong8& b) { return _mm512_mullo_epi64(_mm512_set1_epi64(a), b); }
+
+  __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); }
+  __forceinline vllong8 operator &(const vllong8& a, long long      b) { return _mm512_and_epi64(a, _mm512_set1_epi64(b)); }
+  __forceinline vllong8 operator &(long long      a, const vllong8& b) { return _mm512_and_epi64(_mm512_set1_epi64(a), b); }
+
+  __forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); }
+  __forceinline vllong8 operator |(const vllong8& a, long long      b) { return _mm512_or_epi64(a, _mm512_set1_epi64(b)); }
+  __forceinline vllong8 operator |(long long      a, const vllong8& b) { return _mm512_or_epi64(_mm512_set1_epi64(a), b); }
+
+  __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); }
+  __forceinline vllong8 operator ^(const vllong8& a, long long      b) { return _mm512_xor_epi64(a, _mm512_set1_epi64(b)); }
+  __forceinline vllong8 operator ^(long long      a, const vllong8& b) { return _mm512_xor_epi64(_mm512_set1_epi64(a), b); }
+
+  __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); }
+  __forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a, n); } // arithmetic (sign-filling) shift
+  /* per-lane shift counts */
+  __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a, n); }
+  __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); }
+  /* named forms: sll/sra match the operators above, srl is the logical (zero-filling) right shift */
+  __forceinline vllong8 sll (const vllong8& a, long long b) { return a << b; }
+  __forceinline vllong8 sra (const vllong8& a, long long b) { return a >> b; }
+  __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); }
+
+  __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); } // signed per-lane minimum
+  __forceinline vllong8 min(const vllong8& a, long long      b) { return _mm512_min_epi64(a, _mm512_set1_epi64(b)); }
+  __forceinline vllong8 min(long long      a, const vllong8& b) { return _mm512_min_epi64(_mm512_set1_epi64(a), b); }
+
+  __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); } // signed per-lane maximum
+  __forceinline vllong8 max(const vllong8& a, long long      b) { return _mm512_max_epi64(a, _mm512_set1_epi64(b)); }
+  __forceinline vllong8 max(long long      a, const vllong8& b) { return _mm512_max_epi64(_mm512_set1_epi64(a), b); }
+  
+  __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { const __m512i r = _mm512_mask_add_epi64(c,m,a,b); return r; }
+  __forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { const __m512i r = _mm512_mask_sub_epi64(c,m,a,b); return r; }
+  /* in all four: lanes selected by m receive the result of the operation on a and b, the other lanes keep c */
+  __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { const __m512i r = _mm512_mask_and_epi64(c,m,a,b); return r; }
+  __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { const __m512i r = _mm512_mask_or_epi64(c,m,a,b);  return r; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8& operator +=(vllong8& a, const vllong8&  b) { a = a + b; return a; }
+  __forceinline vllong8& operator +=(vllong8& a, long long       b) { a = a + b; return a; }
+
+  __forceinline vllong8& operator -=(vllong8& a, const vllong8&  b) { a = a - b; return a; }
+  __forceinline vllong8& operator -=(vllong8& a, long long       b) { a = a - b; return a; }
+
+  __forceinline vllong8& operator *=(vllong8& a, const vllong8&  b) { a = a * b; return a; }
+  __forceinline vllong8& operator *=(vllong8& a, long long       b) { a = a * b; return a; }
+
+  __forceinline vllong8& operator &=(vllong8& a, const vllong8&  b) { a = a & b; return a; }
+  __forceinline vllong8& operator &=(vllong8& a, long long       b) { a = a & b; return a; }
+
+  __forceinline vllong8& operator |=(vllong8& a, const vllong8&  b) { a = a | b; return a; }
+  __forceinline vllong8& operator |=(vllong8& a, long long       b) { a = a | b; return a; }
+
+  __forceinline vllong8& operator <<=(vllong8& a, long long b) { a = a << b; return a; }
+  __forceinline vllong8& operator >>=(vllong8& a, long long b) { a = a >> b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 operator ==(const vllong8& a, long long      b) { const vllong8 rhs(b); return a == rhs; }
+  __forceinline vboold8 operator ==(long long      a, const vllong8& b) { const vllong8 lhs(a); return lhs == b; }
+
+  __forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 operator !=(const vllong8& a, long long      b) { const vllong8 rhs(b); return a != rhs; }
+  __forceinline vboold8 operator !=(long long      a, const vllong8& b) { const vllong8 lhs(a); return lhs != b; }
+
+  __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 operator < (const vllong8& a, long long      b) { const vllong8 rhs(b); return a <  rhs; }
+  __forceinline vboold8 operator < (long long      a, const vllong8& b) { const vllong8 lhs(a); return lhs <  b; }
+
+  __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 operator >=(const vllong8& a, long long      b) { const vllong8 rhs(b); return a >= rhs; }
+  __forceinline vboold8 operator >=(long long      a, const vllong8& b) { const vllong8 lhs(a); return lhs >= b; }
+
+  __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 operator > (const vllong8& a, long long      b) { const vllong8 rhs(b); return a >  rhs; }
+  __forceinline vboold8 operator > (long long      a, const vllong8& b) { const vllong8 lhs(a); return lhs >  b; }
+
+  __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboold8 operator <=(const vllong8& a, long long      b) { const vllong8 rhs(b); return a <= rhs; }
+  __forceinline vboold8 operator <=(long long      a, const vllong8& b) { const vllong8 lhs(a); return lhs <= b; }
+
+  __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return a == b; } // named aliases of the comparison operators
+  __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return a != b; }
+  __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return a <  b; }
+  __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return a >= b; }
+  __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return a >  b; }
+  __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return a <= b; }
+    
+  __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { const __mmask8 k = _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); return k; } // result bits are cleared where mask is clear
+  __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { const __mmask8 k = _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); return k; }
+  __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { const __mmask8 k = _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); return k; }
+  __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { const __mmask8 k = _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); return k; }
+  __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { const __mmask8 k = _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); return k; }
+  __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { const __mmask8 k = _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); return k; }
+
+  __forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f) {
+    return _mm512_mask_blend_epi64(m, f, t); // lane k: t[k] where mask bit k is set, f[k] otherwise
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i0, int i1> // i0 / i1 go to the even / odd bits of the 8-bit permute_pd immediate (one bit per 64-bit lane)
+  __forceinline vllong8 shuffle(const vllong8& v) {
+    return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0));
+  }
+  // single-index form: forwards to shuffle<i, i>, i.e. the same selector for even and odd lanes
+  template<int i>
+  __forceinline vllong8 shuffle(const vllong8& v) {
+    return shuffle<i, i>(v);
+  }
+  // four-index form: permutes 64-bit lanes within each 256-bit half; i0 selects the source for the lowest lane of a half
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vllong8 shuffle(const vllong8& v) {
+    return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0, int i1> // i0 / i1 choose which 256-bit half of v becomes the low / high half of the result
+  __forceinline vllong8 shuffle4(const vllong8& v) {
+    return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
+  }
+  // single-index form: forwards to shuffle4<i, i>, i.e. half i is used for both result halves
+  template<int i>
+  __forceinline vllong8 shuffle4(const vllong8& v) {
+    return shuffle4<i, i>(v);
+  }
+
+  template<int i> // treats a:b as one 16-lane value (a in the upper half), shifts it right by i 64-bit lanes and returns the low 8 lanes
+  __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) {
+    return _mm512_alignr_epi64(a, b, i);
+  }
+  // returns lane 0 of v as a scalar
+  __forceinline long long toScalar(const vllong8& v) {
+    return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8 vreduce_min2(vllong8 x) { const vllong8 swapped = shuffle<1,0,3,2>(x); return min(x, swapped); }            // combine neighbouring lanes
+  __forceinline vllong8 vreduce_min4(vllong8 x) { const vllong8 r2 = vreduce_min2(x); return min(r2, shuffle<2,3,0,1>(r2)); }      // then lane pairs within each 256-bit half
+  __forceinline vllong8 vreduce_min (vllong8 x) { const vllong8 r4 = vreduce_min4(x); return min(r4, shuffle4<1,0>(r4)); }         // then the two 256-bit halves
+  // the same three steps for max; each step's result is combined with a shuffled copy of itself
+  __forceinline vllong8 vreduce_max2(vllong8 x) { const vllong8 swapped = shuffle<1,0,3,2>(x); return max(x, swapped); }
+  __forceinline vllong8 vreduce_max4(vllong8 x) { const vllong8 r2 = vreduce_max2(x); return max(r2, shuffle<2,3,0,1>(r2)); }
+  __forceinline vllong8 vreduce_max (vllong8 x) { const vllong8 r4 = vreduce_max4(x); return max(r4, shuffle4<1,0>(r4)); }
+  // bitwise AND
+  __forceinline vllong8 vreduce_and2(vllong8 x) { const vllong8 swapped = shuffle<1,0,3,2>(x); return x & swapped; }
+  __forceinline vllong8 vreduce_and4(vllong8 x) { const vllong8 r2 = vreduce_and2(x); return r2 & shuffle<2,3,0,1>(r2); }
+  __forceinline vllong8 vreduce_and (vllong8 x) { const vllong8 r4 = vreduce_and4(x); return r4 & shuffle4<1,0>(r4); }
+  // bitwise OR
+  __forceinline vllong8 vreduce_or2(vllong8 x) { const vllong8 swapped = shuffle<1,0,3,2>(x); return x | swapped; }
+  __forceinline vllong8 vreduce_or4(vllong8 x) { const vllong8 r2 = vreduce_or2(x); return r2 | shuffle<2,3,0,1>(r2); }
+  __forceinline vllong8 vreduce_or (vllong8 x) { const vllong8 r4 = vreduce_or4(x); return r4 | shuffle4<1,0>(r4); }
+  // sum
+  __forceinline vllong8 vreduce_add2(vllong8 x) { const vllong8 swapped = shuffle<1,0,3,2>(x); return x + swapped; }
+  __forceinline vllong8 vreduce_add4(vllong8 x) { const vllong8 r2 = vreduce_add2(x); return r2 + shuffle<2,3,0,1>(r2); }
+  __forceinline vllong8 vreduce_add (vllong8 x) { const vllong8 r4 = vreduce_add4(x); return r4 + shuffle4<1,0>(r4); }
+  // scalar results: lane 0 of the corresponding full vector reduction
+  __forceinline long long reduce_min(const vllong8& v) { const vllong8 r = vreduce_min(v); return toScalar(r); }
+  __forceinline long long reduce_max(const vllong8& v) { const vllong8 r = vreduce_max(v); return toScalar(r); }
+  __forceinline long long reduce_and(const vllong8& v) { const vllong8 r = vreduce_and(v); return toScalar(r); }
+  __forceinline long long reduce_or (const vllong8& v) { const vllong8 r = vreduce_or (v); return toScalar(r); }
+  __forceinline long long reduce_add(const vllong8& v) { const vllong8 r = vreduce_add(v); return toScalar(r); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Permutation operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8 permute(const vllong8& v, const vllong8& index) {
+    return _mm512_permutexvar_epi64(index, v); // result lane k = v[index[k]] (the intrinsic uses the low 3 bits of each index)
+  }
+  // permutes `a` with the index vector built from the reverse_step constant
+  __forceinline vllong8 reverse(const vllong8& a) {
+    return _mm512_permutexvar_epi64(vllong8(reverse_step), a);
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v)
+  { // writes the 8 lanes as "<v0, v1, ..., v7>"
+    for (size_t lane=0; lane<8; lane++)
+      cout << (lane == 0 ? "<" : ", ") << v[lane];
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vuint16_avx512.h
+++ b/thirdparty/embree/common/simd/vuint16_avx512.h
@@ -0,0 +1,424 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{ 
+  /* 16-wide AVX-512 unsigned integer type */
+  template<>
+  struct vuint<16>
+  {
+    ALIGNED_STRUCT_(64);   
+    // companion mask / unsigned / float vector types with the same lane count
+    typedef vboolf16 Bool;
+    typedef vuint16  UInt;
+    typedef vfloat16 Float;
+
+    enum  { size = 16 }; // number of SIMD elements
+    union {              // data
+      __m512i v;           // the whole 512-bit register
+      unsigned int i[16];  // the same storage viewed as 16 unsigned 32-bit lanes
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vuint() {} // v is left uninitialized
+    __forceinline vuint(const vuint16& t) { v = t.v; }
+    __forceinline vuint16& operator =(const vuint16& f) { v = f.v; return *this; }
+    // wrap a raw register and expose it again; the __m256i cast yields the low 256 bits
+    __forceinline vuint(const __m512i& t) { v = t; }
+    __forceinline operator __m512i() const { return v; }
+    __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); }
+    // the same scalar in all 16 lanes
+    __forceinline vuint(unsigned int i) {
+      v = _mm512_set1_epi32(i);
+    }
+    // the 4 lanes of i repeated in every 128-bit quarter
+    __forceinline vuint(const vuint4& i) {
+      v = _mm512_broadcast_i32x4(i);
+    }
+    // the 8 lanes of i repeated in both 256-bit halves (done with the f64x4 broadcast plus bit casts)
+    __forceinline vuint(const vuint8& i) {
+      v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i))));
+    }
+    // the sequence a,b,c,d repeated four times; a lands in lanes 0, 4, 8 and 12
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) {
+      v = _mm512_set4_epi32(d,c,b,a);      
+    }
+    // one value per lane; a0 is lane 0 and a15 is lane 15
+    __forceinline vuint(unsigned int a0 , unsigned int a1 , unsigned int a2 , unsigned int a3,
+                        unsigned int a4 , unsigned int a5 , unsigned int a6 , unsigned int a7,
+                        unsigned int a8 , unsigned int a9 , unsigned int a10, unsigned int a11,
+                        unsigned int a12, unsigned int a13, unsigned int a14, unsigned int a15)
+    {
+      v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0);
+    }
+    // explicit conversion of every float lane to an unsigned 32-bit integer
+    __forceinline explicit vuint(const __m512& f) {
+      v = _mm512_cvtps_epu32(f);
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vuint(ZeroTy) : v(_mm512_setzero_epi32()) {} // all lanes 0
+    __forceinline vuint(OneTy)  : v(_mm512_set1_epi32(1)) {} // all lanes 1
+    __forceinline vuint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} // lane k holds k
+    __forceinline vuint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} // lane k holds 15-k
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+    // non-temporal (streaming) 64-byte store; the intrinsic requires ptr to be 64-byte aligned
+    static __forceinline void store_nt(void* __restrict__ ptr, const vuint16& a) {
+      _mm512_stream_si512((__m512i*)ptr,a);
+    }
+    // 64-byte load with no alignment requirement
+    static __forceinline vuint16 loadu(const void* addr)
+    {
+      return _mm512_loadu_si512(addr);
+    }
+    // unaligned loads of 16 bytes / 16 shorts, each element zero-extended to a 32-bit lane
+    static __forceinline vuint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+    static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); }
+    // aligned 64-byte loads; the intrinsic requires addr to be 64-byte aligned
+    static __forceinline vuint16 load(const vuint16* addr) {
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vuint16 load(const unsigned int* addr) {
+      return _mm512_load_si512(addr);
+    }
+    // zero-extends 16 shorts that are read through a __m256i dereference of ptr
+    static __forceinline vuint16 load(unsigned short* ptr) { return _mm512_cvtepu16_epi32(*(__m256i*)ptr); }
+
+    // aligned / unaligned full-width stores, followed by their write-masked variants (only mask-selected lanes are written)
+    static __forceinline void store(void* ptr, const vuint16& v) {
+      _mm512_store_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vuint16& v) {
+      _mm512_storeu_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(const vboolf16& mask, void* ptr, const vuint16& f) {
+      _mm512_mask_storeu_epi32(ptr,mask,f);
+    }
+
+    static __forceinline void store(const vboolf16& mask, void* addr, const vuint16& v2) {
+      _mm512_mask_store_epi32(addr,mask,v2);
+    }
+    // packs the mask-selected lanes of v into the low lanes; the remaining lanes keep v's values
+    static __forceinline vuint16 compact(const vboolf16& mask, vuint16& v) {
+      return _mm512_mask_compress_epi32(v,mask,v);
+    }
+    // packs the mask-selected lanes of b into the low lanes; the remaining lanes come from a
+    static __forceinline vuint16 compact(const vboolf16& mask, const vuint16& a, vuint16& b) {
+      return _mm512_mask_compress_epi32(a,mask,b);
+    }
+    // spreads the low lanes of a over the mask-selected positions; the other lanes come from b
+    static __forceinline vuint16 expand(const vboolf16& mask, const vuint16& a, vuint16& b) {
+      return _mm512_mask_expand_epi32(b,mask,a);
+    }
+    // gather: lane k is read from byte address ptr + index[k]*scale; scale defaults to 4
+    template<int scale = 4>
+    static __forceinline vuint16 gather(const unsigned int* ptr, const vint16& index) {
+      return _mm512_i32gather_epi32(index,ptr,scale);
+    }
+    // masked gather: lanes not selected by mask are left undefined
+    template<int scale = 4>
+    static __forceinline vuint16 gather(const vboolf16& mask, const unsigned int* ptr, const vint16& index) {
+      return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale);
+    }
+    // masked gather: lanes not selected by mask are copied from dest
+    template<int scale = 4>
+    static __forceinline vuint16 gather(const vboolf16& mask, vuint16& dest, const unsigned int* ptr, const vint16& index) {
+      return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale);
+    }
+    // scatter: lane k of v is written to byte address ptr + index[k]*scale
+    template<int scale = 4>
+    static __forceinline void scatter(unsigned int* ptr, const vint16& index, const vuint16& v) {
+      _mm512_i32scatter_epi32((int*)ptr,index,v,scale);
+    }
+    // masked scatter: only the mask-selected lanes are written
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf16& mask, unsigned int* ptr, const vint16& index, const vuint16& v) {
+      _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 16); return i[index]; }
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 16); return i[index]; }
+    // uint(): lane by value. uint64_t(): the storage indexed as size_t elements; NOTE(review): hands out a mutable reference from a const member
+    __forceinline unsigned int uint    (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; }
+    __forceinline size_t&      uint64_t(size_t index) const { assert(index < 8);  return ((size_t*)i)[index]; }
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a); } // mask bit k = most significant bit of lane k
+  // unary plus returns its operand unchanged; unary minus computes 0 - a per lane
+  __forceinline vuint16 operator +(const vuint16& a) { return a; }
+  __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint16 operator +(const vuint16& a, const vuint16& b) { return _mm512_add_epi32(a, b); }
+  __forceinline vuint16 operator +(const vuint16& a, unsigned int   b) { return _mm512_add_epi32(a, _mm512_set1_epi32(b)); }
+  __forceinline vuint16 operator +(unsigned int   a, const vuint16& b) { return _mm512_add_epi32(_mm512_set1_epi32(a), b); }
+  // lane-wise 32-bit add / subtract; a scalar operand is replicated into all 16 lanes first
+  __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { return _mm512_sub_epi32(a, b); }
+  __forceinline vuint16 operator -(const vuint16& a, unsigned int   b) { return _mm512_sub_epi32(a, _mm512_set1_epi32(b)); }
+  __forceinline vuint16 operator -(unsigned int   a, const vuint16& b) { return _mm512_sub_epi32(_mm512_set1_epi32(a), b); }
+
+  __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mullo_epi32(a, b); } // lane-wise product modulo 2^32 in all 16 lanes; _mm512_mul_epu32 would multiply only the even lanes into 64-bit results
+  __forceinline vuint16 operator *(const vuint16& a, unsigned int   b) { return a * vuint16(b); } // scalar operand is replicated into all lanes first
+  __forceinline vuint16 operator *(unsigned int   a, const vuint16& b) { return vuint16(a) * b; }
+
+  __forceinline vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_epi32(a, b); }
+  __forceinline vuint16 operator &(const vuint16& a, unsigned int   b) { return _mm512_and_epi32(a, _mm512_set1_epi32(b)); }
+  __forceinline vuint16 operator &(unsigned int   a, const vuint16& b) { return _mm512_and_epi32(_mm512_set1_epi32(a), b); }
+  // bitwise AND / OR / XOR; a scalar operand is replicated into all 16 lanes first
+  __forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_epi32(a, b); }
+  __forceinline vuint16 operator |(const vuint16& a, unsigned int   b) { return _mm512_or_epi32(a, _mm512_set1_epi32(b)); }
+  __forceinline vuint16 operator |(unsigned int   a, const vuint16& b) { return _mm512_or_epi32(_mm512_set1_epi32(a), b); }
+
+  __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_epi32(a, b); }
+  __forceinline vuint16 operator ^(const vuint16& a, unsigned int   b) { return _mm512_xor_epi32(a, _mm512_set1_epi32(b)); }
+  __forceinline vuint16 operator ^(unsigned int   a, const vuint16& b) { return _mm512_xor_epi32(_mm512_set1_epi32(a), b); }
+
+  __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { return _mm512_slli_epi32(a, n); } // every lane shifted by the same count
+  __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { return _mm512_srli_epi32(a, n); } // logical right shift (zeros shifted in)
+  // per-lane shift counts taken from the matching lane of n
+  __forceinline vuint16 operator <<(const vuint16& a, const vuint16& n) { return _mm512_sllv_epi32(a, n); }
+  __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { return _mm512_srlv_epi32(a, n); }
+  // named shifts: sll / srl match the operators above, sra is the arithmetic right shift (sign bit replicated)
+  __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return a << b; }
+  __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a, b); }
+  __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return a >> b; }
+  
+  __forceinline vuint16 min(const vuint16& a, const vuint16& b) { return _mm512_min_epu32(a, b); }
+  __forceinline vuint16 min(const vuint16& a, unsigned int   b) { return _mm512_min_epu32(a, _mm512_set1_epi32(b)); }
+  __forceinline vuint16 min(unsigned int   a, const vuint16& b) { return _mm512_min_epu32(_mm512_set1_epi32(a), b); }
+  // lane-wise unsigned minimum / maximum; a scalar operand is replicated into all 16 lanes first
+  __forceinline vuint16 max(const vuint16& a, const vuint16& b) { return _mm512_max_epu32(a, b); }
+  __forceinline vuint16 max(const vuint16& a, unsigned int   b) { return _mm512_max_epu32(a, _mm512_set1_epi32(b)); }
+  __forceinline vuint16 max(unsigned int   a, const vuint16& b) { return _mm512_max_epu32(_mm512_set1_epi32(a), b); }
+  
+  __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } // lane k: a+b where mask bit k is set, c otherwise
+  __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } // lane k: a-b where selected, c otherwise
+  // bitwise variants with the same convention: c supplies the lanes that the mask does not select; c itself is not modified
+  __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c,m,a,b); }
+  __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c,m,a,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint16& operator +=(vuint16& a, const vuint16& b) { a = a + b; return a; } // each compound form stores the binary result in a and returns a
+  __forceinline vuint16& operator +=(vuint16& a, unsigned int   b) { a = a + b; return a; }
+  
+  __forceinline vuint16& operator -=(vuint16& a, const vuint16& b) { a = a - b; return a; }
+  __forceinline vuint16& operator -=(vuint16& a, unsigned int   b) { a = a - b; return a; }
+
+  __forceinline vuint16& operator *=(vuint16& a, const vuint16& b) { a = a * b; return a; }
+  __forceinline vuint16& operator *=(vuint16& a, unsigned int   b) { a = a * b; return a; }
+  
+  __forceinline vuint16& operator &=(vuint16& a, const vuint16& b) { a = a & b; return a; }
+  __forceinline vuint16& operator &=(vuint16& a, unsigned int   b) { a = a & b; return a; }
+  
+  __forceinline vuint16& operator |=(vuint16& a, const vuint16& b) { a = a | b; return a; }
+  __forceinline vuint16& operator |=(vuint16& a, unsigned int   b) { a = a | b; return a; }
+  // shift-assign exists only for a uniform scalar count
+  __forceinline vuint16& operator <<=(vuint16& a, unsigned int b) { a = a << b; return a; }
+  __forceinline vuint16& operator >>=(vuint16& a, unsigned int b) { a = a >> b; return a; }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmpeq_epu32_mask(a,b); } // all relations compare lanes as unsigned 32-bit values
+  __forceinline vboolf16 operator ==(const vuint16& a, unsigned int   b) { return _mm512_cmpeq_epu32_mask(a,_mm512_set1_epi32(b)); }
+  __forceinline vboolf16 operator ==(unsigned int   a, const vuint16& b) { return _mm512_cmpeq_epu32_mask(_mm512_set1_epi32(a),b); }
+  
+  __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmpneq_epu32_mask(a,b); }
+  __forceinline vboolf16 operator !=(const vuint16& a, unsigned int   b) { return _mm512_cmpneq_epu32_mask(a,_mm512_set1_epi32(b)); }
+  __forceinline vboolf16 operator !=(unsigned int   a, const vuint16& b) { return _mm512_cmpneq_epu32_mask(_mm512_set1_epi32(a),b); }
+  
+  __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmplt_epu32_mask(a,b); }
+  __forceinline vboolf16 operator < (const vuint16& a, unsigned int   b) { return _mm512_cmplt_epu32_mask(a,_mm512_set1_epi32(b)); }
+  __forceinline vboolf16 operator < (unsigned int   a, const vuint16& b) { return _mm512_cmplt_epu32_mask(_mm512_set1_epi32(a),b); }
+  
+  __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmpge_epu32_mask(a,b); }
+  __forceinline vboolf16 operator >=(const vuint16& a, unsigned int   b) { return _mm512_cmpge_epu32_mask(a,_mm512_set1_epi32(b)); }
+  __forceinline vboolf16 operator >=(unsigned int   a, const vuint16& b) { return _mm512_cmpge_epu32_mask(_mm512_set1_epi32(a),b); }
+
+  __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmpgt_epu32_mask(a,b); }
+  __forceinline vboolf16 operator > (const vuint16& a, unsigned int   b) { return _mm512_cmpgt_epu32_mask(a,_mm512_set1_epi32(b)); }
+  __forceinline vboolf16 operator > (unsigned int   a, const vuint16& b) { return _mm512_cmpgt_epu32_mask(_mm512_set1_epi32(a),b); }
+
+  __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { return _mm512_cmple_epu32_mask(a,b); }
+  __forceinline vboolf16 operator <=(const vuint16& a, unsigned int   b) { return _mm512_cmple_epu32_mask(a,_mm512_set1_epi32(b)); }
+  __forceinline vboolf16 operator <=(unsigned int   a, const vuint16& b) { return _mm512_cmple_epu32_mask(_mm512_set1_epi32(a),b); }
+
+  __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return a == b; } // named forms of the unsigned lane compares
+  __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return a != b; }
+  __forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return a <  b; }
+  __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return a >= b; }
+  __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return a >  b; }
+  __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return a <= b; }
+  // masked forms: a result bit can only be set for lanes whose bit is set in `mask`
+  __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmpeq_epu32_mask (mask,a,b); }
+  __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmpneq_epu32_mask(mask,a,b); }
+  __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmplt_epu32_mask (mask,a,b); }
+  __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmpge_epu32_mask (mask,a,b); }
+  __forceinline vboolf16 gt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmpgt_epu32_mask (mask,a,b); }
+  __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmple_epu32_mask (mask,a,b); }
+    
+ 
+  __forceinline vuint16 select(const vboolf16& m, const vuint16& t, const vuint16& f) {
+    return _mm512_mask_blend_epi32(m, f, t); // lane k: t[k] where mask bit k is set, f[k] otherwise
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i> // every lane receives lane i of its own 128-bit block
+  __forceinline vuint16 shuffle(const vuint16& v) {
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+  // permutes the four 32-bit lanes inside each 128-bit block; i0 selects the source for the lowest lane of a block
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint16 shuffle(const vuint16& v) {
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+  // shuffle4 moves whole 128-bit blocks: this form copies block i into all four positions
+  template<int i>
+  __forceinline vuint16 shuffle4(const vuint16& v) {
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v) ,_MM_SHUFFLE(i, i, i, i)));
+  }
+  // general block permutation: result block k is block i_k of v
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint16 shuffle4(const vuint16& v) {
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i> // treats a:b as one 32-lane value (a in the upper half), shifts it right by i 32-bit lanes and returns the low 16 lanes
+  __forceinline vuint16 align_shift_right(const vuint16& a, const vuint16& b) {
+    return _mm512_alignr_epi32(a, b, i);
+  }
+  // returns lane 0 of v as a scalar
+  __forceinline unsigned int toScalar(const vuint16& v) {
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(v));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint16 vreduce_min2(vuint16 x) { const vuint16 swapped = shuffle<1,0,3,2>(x); return min(x, swapped); }          // combine neighbouring lanes
+  __forceinline vuint16 vreduce_min4(vuint16 x) { const vuint16 r2 = vreduce_min2(x); return min(r2, shuffle<2,3,0,1>(r2)); }    // then lane pairs within each 128-bit block
+  __forceinline vuint16 vreduce_min8(vuint16 x) { const vuint16 r4 = vreduce_min4(x); return min(r4, shuffle4<1,0,3,2>(r4)); }   // then neighbouring 128-bit blocks
+  __forceinline vuint16 vreduce_min (vuint16 x) { const vuint16 r8 = vreduce_min8(x); return min(r8, shuffle4<2,3,0,1>(r8)); }   // then the two 256-bit halves
+  // the same four steps for max; each step's result is combined with a shuffled copy of itself
+  __forceinline vuint16 vreduce_max2(vuint16 x) { const vuint16 swapped = shuffle<1,0,3,2>(x); return max(x, swapped); }
+  __forceinline vuint16 vreduce_max4(vuint16 x) { const vuint16 r2 = vreduce_max2(x); return max(r2, shuffle<2,3,0,1>(r2)); }
+  __forceinline vuint16 vreduce_max8(vuint16 x) { const vuint16 r4 = vreduce_max4(x); return max(r4, shuffle4<1,0,3,2>(r4)); }
+  __forceinline vuint16 vreduce_max (vuint16 x) { const vuint16 r8 = vreduce_max8(x); return max(r8, shuffle4<2,3,0,1>(r8)); }
+  // bitwise AND
+  __forceinline vuint16 vreduce_and2(vuint16 x) { const vuint16 swapped = shuffle<1,0,3,2>(x); return x & swapped; }
+  __forceinline vuint16 vreduce_and4(vuint16 x) { const vuint16 r2 = vreduce_and2(x); return r2 & shuffle<2,3,0,1>(r2); }
+  __forceinline vuint16 vreduce_and8(vuint16 x) { const vuint16 r4 = vreduce_and4(x); return r4 & shuffle4<1,0,3,2>(r4); }
+  __forceinline vuint16 vreduce_and (vuint16 x) { const vuint16 r8 = vreduce_and8(x); return r8 & shuffle4<2,3,0,1>(r8); }
+  // bitwise OR
+  __forceinline vuint16 vreduce_or2(vuint16 x) { const vuint16 swapped = shuffle<1,0,3,2>(x); return x | swapped; }
+  __forceinline vuint16 vreduce_or4(vuint16 x) { const vuint16 r2 = vreduce_or2(x); return r2 | shuffle<2,3,0,1>(r2); }
+  __forceinline vuint16 vreduce_or8(vuint16 x) { const vuint16 r4 = vreduce_or4(x); return r4 | shuffle4<1,0,3,2>(r4); }
+  __forceinline vuint16 vreduce_or (vuint16 x) { const vuint16 r8 = vreduce_or8(x); return r8 | shuffle4<2,3,0,1>(r8); }
+  // sum
+  __forceinline vuint16 vreduce_add2(vuint16 x) { const vuint16 swapped = shuffle<1,0,3,2>(x); return x + swapped; }
+  __forceinline vuint16 vreduce_add4(vuint16 x) { const vuint16 r2 = vreduce_add2(x); return r2 + shuffle<2,3,0,1>(r2); }
+  __forceinline vuint16 vreduce_add8(vuint16 x) { const vuint16 r4 = vreduce_add4(x); return r4 + shuffle4<1,0,3,2>(r4); }
+  __forceinline vuint16 vreduce_add (vuint16 x) { const vuint16 r8 = vreduce_add8(x); return r8 + shuffle4<2,3,0,1>(r8); }
+  // scalar results: lane 0 of the corresponding full vector reduction
+  __forceinline unsigned int reduce_min(const vuint16& v) { const vuint16 r = vreduce_min(v); return toScalar(r); }
+  __forceinline unsigned int reduce_max(const vuint16& v) { const vuint16 r = vreduce_max(v); return toScalar(r); }
+  __forceinline unsigned int reduce_and(const vuint16& v) { const vuint16 r = vreduce_and(v); return toScalar(r); }
+  __forceinline unsigned int reduce_or (const vuint16& v) { const vuint16 r = vreduce_or (v); return toScalar(r); }
+  __forceinline unsigned int reduce_add(const vuint16& v) { const vuint16 r = vreduce_add(v); return toScalar(r); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Permutation and prefix-sum operations
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vuint16 permute(vuint16 v, vuint16 index) {
+    return _mm512_permutexvar_epi32(index, v); // result lane k = v[index[k]] (the intrinsic uses the low 4 bits of each index)
+  }
+  // permutes `a` with the index vector built from the reverse_step constant (lane k holds 15-k)
+  __forceinline vuint16 reverse(const vuint16& a) {
+    return _mm512_permutexvar_epi32(vuint16(reverse_step), a);
+  }
+
+  __forceinline vuint16 prefix_sum(const vuint16& a) 
+  { // inclusive scan: result lane k = a[0] + ... + a[k], built in four doubling steps
+    const vuint16 zeros(zero);
+    vuint16 acc = a;
+    acc += align_shift_right<15>(acc,zeros); // add the value 1 lane below (zeros enter at the bottom)
+    acc += align_shift_right<14>(acc,zeros); // add the value 2 lanes below
+    acc += align_shift_right<12>(acc,zeros); // add the value 4 lanes below
+    acc += align_shift_right<8>(acc,zeros);  // add the value 8 lanes below
+    return acc;
+  }
+
+  __forceinline vuint16 reverse_prefix_sum(const vuint16& a) 
+  { // inclusive scan from the top: result lane k = a[k] + ... + a[15]
+    const vuint16 zeros(zero);
+    vuint16 acc = a;
+    acc += align_shift_right<1>(zeros,acc); // add the value 1 lane above (zeros enter at the top)
+    acc += align_shift_right<2>(zeros,acc); // add the value 2 lanes above
+    acc += align_shift_right<4>(zeros,acc); // add the value 4 lanes above
+    acc += align_shift_right<8>(zeros,acc); // add the value 8 lanes above
+    return acc;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint16& v)
+  { // writes the 16 lanes as "<v0, v1, ..., v15>"
+    for (size_t lane=0; lane<16; lane++)
+      cout << (lane == 0 ? "<" : ", ") << v[lane];
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vuint4_sse2.h
+++ b/thirdparty/embree/common/simd/vuint4_sse2.h
@@ -0,0 +1,444 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/emath.h"
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide SSE unsigned integer type: four 32-bit unsigned lanes held in one __m128i */
+  template<>
+  struct vuint<4>
+  {
+    ALIGNED_STRUCT_(16);
+
+    typedef vboolf4 Bool;
+    typedef vuint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 }; // number of SIMD elements
+    union { __m128i v; unsigned int i[4]; }; // data: register view and per-lane view of the same 16 bytes
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint() {} // lanes are left uninitialized
+    __forceinline vuint(const vuint4& a) { v = a.v; }
+    __forceinline vuint4& operator =(const vuint4& a) { v = a.v; return *this; }
+
+    /* implicit conversion from/to the raw register, so intrinsics can take a vuint4 directly */
+    __forceinline vuint(const __m128i a) : v(a) {}
+    __forceinline operator const __m128i&() const { return v; }
+    __forceinline operator       __m128i&()       { return v; }
+
+    /* broadcast of one scalar to all lanes, and per-lane construction (a -> lane 0 ... d -> lane 3) */
+    __forceinline vuint(unsigned int a) : v(_mm_set1_epi32(a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm_set_epi32(d, c, b, a)) {}
+
+#if defined(__AVX512VL__)
+    /* float -> unsigned conversion; this constructor only exists when AVX-512VL is enabled */
+    __forceinline explicit vuint(__m128 a) : v(_mm_cvtps_epu32(a)) {}
+#endif
+
+    /* mask -> integer vector: AVX-512VL expands every mask bit into an all-ones or
+       all-zeros lane; otherwise the bits of the mask register are reinterpreted unchanged */
+#if defined(__AVX512VL__)
+    __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a)) {}
+#else
+    __forceinline explicit vuint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {}
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint(ZeroTy)   : v(_mm_setzero_si128()) {}
+    __forceinline vuint(OneTy)    : v(_mm_set1_epi32(1)) {}
+    __forceinline vuint(PosInfTy) : v(_mm_set1_epi32(unsigned(pos_inf))) {}
+    __forceinline vuint(StepTy)   : v(_mm_set_epi32(3, 2, 1, 0)) {} // lanes hold 0,1,2,3
+    /* all bits set in every lane; built from a constant instead of comparing the
+       not-yet-initialized member with itself, which read an indeterminate value */
+    __forceinline vuint(TrueTy)   : v(_mm_set1_epi32(-1)) {}
+    __forceinline vuint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /* full 16-byte load/store; the non-"u" variants use the aligned intrinsics */
+    static __forceinline vuint4 load (const void* a) { return _mm_load_si128((__m128i*)a); }
+    static __forceinline vuint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); }
+
+    static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v); }
+
+    /* masked load/store, one implementation per instruction set */
+#if defined(__AVX512VL__)
+    /* inactive lanes of a masked load come from the zero source operand */
+    static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); }
+    static __forceinline vuint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask,v); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); }
+#elif defined(__AVX__)
+    /* load and loadu share the same _mm_maskload_ps / _mm_maskstore_ps implementation */
+    static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+    static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+#else
+    /* SSE fallback: the load reads all 16 bytes and ANDs them with the mask; the
+       store is a read-modify-write of all 16 bytes (load, select, store) */
+    static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); }
+    static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { store (ptr,select(mask,i,load (ptr))); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); }
+#endif
+
+    /* load of unsigned bytes widened to 32-bit lanes.
+       NOTE(review): these overloads exist only for aarch64 or SSE4.1; on other
+       targets a call with an unsigned char pointer binds to the raw const void*
+       load above - confirm no caller depends on that. */
+#if defined(__aarch64__)
+    static __forceinline vuint4 load(const unsigned char* ptr) {
+        return _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+    static __forceinline vuint4 loadu(const unsigned char* ptr) {
+        return _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+#elif defined(__SSE4_1__)
+    /* NOTE(review): _mm_loadl_epi64 reads 8 bytes although only the low 4 are
+       converted by _mm_cvtepu8_epi32 - confirm callers keep 4 bytes of slack */
+    static __forceinline vuint4 load(const unsigned char* ptr) {
+      return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
+    }
+
+    static __forceinline vuint4 loadu(const unsigned char* ptr) {
+      return  _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
+    }
+
+#endif
+
+    /* load of four unsigned shorts widened to 32-bit lanes.
+       NOTE(review): the SSE4.1 path reads 16 bytes with an unaligned load although
+       only the low 8 are converted - confirm callers keep 8 bytes of slack */
+    static __forceinline vuint4 load(const unsigned short* ptr) {
+#if defined(__aarch64__)
+      return _mm_load4epu16_epi32(((__m128i*)ptr));
+#elif defined (__SSE4_1__)
+      return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
+#else
+      return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+#endif
+    }
+
+    /* streaming load when available (aarch64 or SSE4.1), otherwise a regular aligned load */
+    static __forceinline vuint4 load_nt(void* ptr) {
+#if (defined(__aarch64__)) || defined(__SSE4_1__)
+      return _mm_stream_load_si128((__m128i*)ptr);
+#else
+      return _mm_load_si128((__m128i*)ptr);
+#endif
+    }
+
+    /* streaming store on SSE4.1 (non-aarch64) builds, otherwise a regular aligned store */
+    static __forceinline void store_nt(void* ptr, const vuint4& v) {
+#if !defined(__aarch64__) && defined(__SSE4_1__)
+      _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
+#else
+      _mm_store_si128((__m128i*)ptr,v);
+#endif
+    }
+
+    /* gather: lane k is read from byte address ptr + scale*index[k] */
+    template<int scale = 4>
+    static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_i32gather_epi32((const int*)ptr, index, scale);
+#else
+      return vuint4(
+          *(unsigned int*)(((char*)ptr)+scale*index[0]),
+          *(unsigned int*)(((char*)ptr)+scale*index[1]),
+          *(unsigned int*)(((char*)ptr)+scale*index[2]),
+          *(unsigned int*)(((char*)ptr)+scale*index[3]));
+#endif
+    }
+
+    /* masked gather: starts from a zero vector, so lanes whose mask is not set stay zero;
+       the scalar fallback does not touch memory for those lanes */
+    template<int scale = 4>
+    static __forceinline vuint4 gather(const vboolf4& mask, const unsigned int* ptr, const vint4& index) {
+      vuint4 r = zero;
+#if defined(__AVX512VL__)
+      return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#elif defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]);
+      return r;
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /* per-lane access through the union's array view; index must be below 4 (asserted) */
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 4); return i[index]; }
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 4); return i[index]; }
+
+    /* per-lane select: takes t where the mask m is set and f elsewhere */
+    friend __forceinline vuint4 select(const vboolf4& m, const vuint4& t, const vuint4& f) {
+#if defined(__AVX512VL__)
+      return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
+#elif defined(__SSE4_1__)
+      return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
+#else
+      return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f));
+#endif
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Integer vector -> boolean mask: AVX-512VL extracts a mask with
+     _mm_movepi32_mask, otherwise the register bits are reinterpreted as floats. */
+  __forceinline vboolf4 asBool(const vuint4& a) {
+#if defined(__AVX512VL__)
+    return _mm_movepi32_mask(a);
+#else
+    return _mm_castsi128_ps(a);
+#endif
+  }
+
+  /* Unary plus: returns a copy of the operand. */
+  __forceinline vuint4 operator +(const vuint4& a) {
+    return a;
+  }
+
+  /* Unary minus: subtracts the operand from zero in every lane. */
+  __forceinline vuint4 operator -(const vuint4& a) {
+    const __m128i zeros = _mm_setzero_si128();
+    return _mm_sub_epi32(zeros, a);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Lane-wise 32-bit addition; a scalar operand is broadcast to all lanes first. */
+  __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) {
+    return _mm_add_epi32(a, b);
+  }
+  __forceinline vuint4 operator +(const vuint4& a, unsigned int  b) {
+    const vuint4 splat(b);
+    return a + splat;
+  }
+  __forceinline vuint4 operator +(unsigned int  a, const vuint4& b) {
+    const vuint4 splat(a);
+    return splat + b;
+  }
+
+  /* Lane-wise 32-bit subtraction; a scalar operand is broadcast to all lanes first. */
+  __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) {
+    return _mm_sub_epi32(a, b);
+  }
+  __forceinline vuint4 operator -(const vuint4& a, unsigned int  b) {
+    const vuint4 splat(b);
+    return a - splat;
+  }
+  __forceinline vuint4 operator -(unsigned int  a, const vuint4& b) {
+    const vuint4 splat(a);
+    return splat - b;
+  }
+
+//#if defined(__SSE4_1__)
+//  __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return _mm_mullo_epu32(a, b); }
+//#else
+//  __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return vuint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); }
+//#endif
+//  __forceinline vuint4 operator *(const vuint4& a, unsigned int  b) { return a * vuint4(b); }
+//  __forceinline vuint4 operator *(unsigned int  a, const vuint4& b) { return vuint4(a) * b; }
+
+  /* Bitwise AND; a scalar operand is broadcast to all lanes first. */
+  __forceinline vuint4 operator &(const vuint4& a, const vuint4& b) {
+    return _mm_and_si128(a, b);
+  }
+  __forceinline vuint4 operator &(const vuint4& a, unsigned int  b) {
+    const vuint4 splat(b);
+    return a & splat;
+  }
+  __forceinline vuint4 operator &(unsigned int  a, const vuint4& b) {
+    const vuint4 splat(a);
+    return splat & b;
+  }
+
+  /* Bitwise OR; a scalar operand is broadcast to all lanes first. */
+  __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) {
+    return _mm_or_si128(a, b);
+  }
+  __forceinline vuint4 operator |(const vuint4& a, unsigned int  b) {
+    const vuint4 splat(b);
+    return a | splat;
+  }
+  __forceinline vuint4 operator |(unsigned int  a, const vuint4& b) {
+    const vuint4 splat(a);
+    return splat | b;
+  }
+
+  /* Bitwise XOR; a scalar operand is broadcast to all lanes first. */
+  __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) {
+    return _mm_xor_si128(a, b);
+  }
+  __forceinline vuint4 operator ^(const vuint4& a, unsigned int  b) {
+    const vuint4 splat(b);
+    return a ^ splat;
+  }
+  __forceinline vuint4 operator ^(unsigned int  a, const vuint4& b) {
+    const vuint4 splat(a);
+    return splat ^ b;
+  }
+
+  /* Shifts every lane by the same count: << is a logical left shift and
+     >> a logical (zero-filling) right shift. */
+  __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) {
+    return _mm_slli_epi32(a, n);
+  }
+  __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) {
+    return _mm_srli_epi32(a, n);
+  }
+
+  /* Named shift variants: sll/srl match the operators above, while sra uses the
+     arithmetic right shift (_mm_srai_epi32), which replicates each lane's top bit. */
+  __forceinline vuint4 sll (const vuint4& a, unsigned int b) {
+    return a << b;
+  }
+  __forceinline vuint4 sra (const vuint4& a, unsigned int b) {
+    return _mm_srai_epi32(a, b);
+  }
+  __forceinline vuint4 srl (const vuint4& a, unsigned int b) {
+    return a >> b;
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Compound add/subtract: update a in place and return a reference to it. */
+  __forceinline vuint4& operator +=(vuint4& a, const vuint4& b) { a = a + b; return a; }
+  __forceinline vuint4& operator +=(vuint4& a, unsigned int  b) { a = a + b; return a; }
+
+  __forceinline vuint4& operator -=(vuint4& a, const vuint4& b) { a = a - b; return a; }
+  __forceinline vuint4& operator -=(vuint4& a, unsigned int  b) { a = a - b; return a; }
+
+//#if defined(__SSE4_1__)
+//  __forceinline vuint4& operator *=(vuint4& a, const vuint4& b) { return a = a * b; }
+//  __forceinline vuint4& operator *=(vuint4& a, unsigned int  b) { return a = a * b; }
+//#endif
+  
+  /* Compound bitwise and shift operators: update a in place and return a reference to it. */
+  __forceinline vuint4& operator &=(vuint4& a, const vuint4& b) { a = a & b; return a; }
+  __forceinline vuint4& operator &=(vuint4& a, unsigned int  b) { a = a & b; return a; }
+
+  __forceinline vuint4& operator |=(vuint4& a, const vuint4& b) { a = a | b; return a; }
+  __forceinline vuint4& operator |=(vuint4& a, unsigned int  b) { a = a | b; return a; }
+
+  __forceinline vuint4& operator <<=(vuint4& a, unsigned int  b) { a = a << b; return a; }
+  __forceinline vuint4& operator >>=(vuint4& a, unsigned int  b) { a = a >> b; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  /* AVX-512VL: unsigned lane compares that produce a mask directly. */
+  __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) {
+    return _mm_cmp_epu32_mask(a, b, _MM_CMPINT_EQ);
+  }
+  __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) {
+    return _mm_cmp_epu32_mask(a, b, _MM_CMPINT_NE);
+  }
+  //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+  //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+  //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+  //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+#else
+  /* SSE: lane equality via _mm_cmpeq_epi32, reinterpreted as a float-typed mask;
+     inequality is the negation of that mask. */
+  __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) {
+    const __m128i equal_bits = _mm_cmpeq_epi32(a, b);
+    return _mm_castsi128_ps(equal_bits);
+  }
+  __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) {
+    const vboolf4 equal = (a == b);
+    return !equal;
+  }
+  //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmplt_epu32(a, b)); }
+  //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return !(a <  b); }
+  //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epu32(a, b)); }
+  //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return !(a >  b); }
+#endif
+
+  /* Vector/scalar equality: the scalar is broadcast, then compared lane by lane. */
+  __forceinline vboolf4 operator ==(const vuint4& a, unsigned int  b) {
+    const vuint4 splat(b);
+    return a == splat;
+  }
+  __forceinline vboolf4 operator ==(unsigned int  a, const vuint4& b) {
+    const vuint4 splat(a);
+    return splat == b;
+  }
+
+  /* Vector/scalar inequality: the scalar is broadcast, then compared lane by lane. */
+  __forceinline vboolf4 operator !=(const vuint4& a, unsigned int  b) {
+    const vuint4 splat(b);
+    return a != splat;
+  }
+  __forceinline vboolf4 operator !=(unsigned int  a, const vuint4& b) {
+    const vuint4 splat(a);
+    return splat != b;
+  }
+
+  //__forceinline vboolf4 operator < (const vuint4& a, unsigned int  b) { return a <  vuint4(b); }
+  //__forceinline vboolf4 operator < (unsigned int  a, const vuint4& b) { return vuint4(a) <  b; }
+
+  //__forceinline vboolf4 operator >=(const vuint4& a, unsigned int  b) { return a >= vuint4(b); }
+  //__forceinline vboolf4 operator >=(unsigned int  a, const vuint4& b) { return vuint4(a) >= b; }
+
+  //__forceinline vboolf4 operator > (const vuint4& a, unsigned int  b) { return a >  vuint4(b); }
+  //__forceinline vboolf4 operator > (unsigned int  a, const vuint4& b) { return vuint4(a) >  b; }
+
+  //__forceinline vboolf4 operator <=(const vuint4& a, unsigned int  b) { return a <= vuint4(b); }
+  //__forceinline vboolf4 operator <=(unsigned int  a, const vuint4& b) { return vuint4(a) <= b; }
+
+  /* Named forms of the == and != operators. */
+  __forceinline vboolf4 eq(const vuint4& a, const vuint4& b) {
+    const vboolf4 same = (a == b);
+    return same;
+  }
+  __forceinline vboolf4 ne(const vuint4& a, const vuint4& b) {
+    const vboolf4 different = (a != b);
+    return different;
+  }
+  //__forceinline vboolf4 lt(const vuint4& a, const vuint4& b) { return a <  b; }
+  //__forceinline vboolf4 ge(const vuint4& a, const vuint4& b) { return a >= b; }
+  //__forceinline vboolf4 gt(const vuint4& a, const vuint4& b) { return a >  b; }
+  //__forceinline vboolf4 le(const vuint4& a, const vuint4& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+  /* Masked compares: AVX-512VL passes the incoming mask to the masked compare intrinsic. */
+  __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) {
+    return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ);
+  }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) {
+    return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE);
+  }
+  //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); }
+  //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); }
+  //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); }
+  //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  /* Masked compares: the plain comparison result is ANDed with the incoming mask. */
+  __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) {
+    const vboolf4 same = (a == b);
+    return mask & same;
+  }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) {
+    const vboolf4 different = (a != b);
+    return mask & different;
+  }
+  //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <  b); }
+  //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >= b); }
+  //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >  b); }
+  //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <= b); }
+#endif
+
+  /* Select with a compile-time lane mask: takes t in the lanes picked by mask and
+     f elsewhere. SSE4.1 uses the immediate blend; otherwise a vboolf4 is built
+     from the integer mask and the runtime select is used. */
+  template<int mask>
+  __forceinline vuint4 select(const vuint4& t, const vuint4& f) {
+#if defined(__SSE4_1__)
+    const __m128 blended = _mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask);
+    return _mm_castps_si128(blended);
+#else
+    const vboolf4 lanes(mask);
+    return select(lanes, t, f);
+#endif
+  }
+
+/*#if defined(__SSE4_1__)
+  __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return _mm_min_epu32(a, b); }
+  __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return _mm_max_epu32(a, b); }
+
+#else
+  __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return select(a < b,a,b); }
+  __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return select(a < b,b,a); }
+#endif
+
+  __forceinline vuint4 min(const vuint4& a, unsigned int  b) { return min(a,vuint4(b)); }
+  __forceinline vuint4 min(unsigned int  a, const vuint4& b) { return min(vuint4(a),b); }
+  __forceinline vuint4 max(const vuint4& a, unsigned int  b) { return max(a,vuint4(b)); }
+  __forceinline vuint4 max(unsigned int  a, const vuint4& b) { return max(vuint4(a),b); }*/
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Interleaves the two low lanes of a and b (float unpack applied to the integer bits). */
+  __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) {
+    const __m128 fa = _mm_castsi128_ps(a);
+    const __m128 fb = _mm_castsi128_ps(b);
+    return _mm_castps_si128(_mm_unpacklo_ps(fa, fb));
+  }
+  /* Interleaves the two high lanes of a and b (float unpack applied to the integer bits). */
+  __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) {
+    const __m128 fa = _mm_castsi128_ps(a);
+    const __m128 fb = _mm_castsi128_ps(b);
+    return _mm_castps_si128(_mm_unpackhi_ps(fa, fb));
+  }
+
+  /* Compile-time lane shuffles. The one-vector form reorders the lanes of v by the
+     indices i0..i3; the two-vector form draws lanes from a and b. */
+#if defined(__aarch64__)
+  /* aarch64: implemented as byte table lookups (vqtbl1q_u8 / vqtbl2q_u8).
+     NOTE(review): _MN_SHUFFLE / _MF_SHUFFLE are defined elsewhere; presumably they
+     expand the four lane indices into a byte index table - confirm. */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& v) {
+    return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+  }
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
+    return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+  }
+#else
+  /* x86: _MM_SHUFFLE takes its arguments from the highest lane down, hence the
+     reversed i3..i0 order. */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& v) {
+    return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  /* _mm_shuffle_ps takes its two low result lanes from a and its two high ones from b */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
+    return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+#endif
+#if defined(__SSE3__)
+  /* SSE3 specializations that map three common patterns onto the dedicated
+     duplicate instructions (moveldup / movehdup / movedup) */
+  template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); }
+#endif
+
+  /* Broadcast of lane i to all four lanes. */
+  template<int i>
+  __forceinline vuint4 shuffle(const vuint4& v) {
+    const vuint4 broadcast = shuffle<i,i,i,i>(v);
+    return broadcast;
+  }
+
+  /* extract<src> reads one lane; insert<dst> returns a copy of a with one lane
+     replaced by b. SSE4.1 (non-aarch64) uses the dedicated intrinsics, otherwise
+     the lane is accessed through operator[] with the index wrapped to 0..3. */
+#if defined(__SSE4_1__) && !defined(__aarch64__)
+  template<int src> __forceinline unsigned int extract(const vuint4& b) {
+    return _mm_extract_epi32(b, src);
+  }
+  template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) {
+    return _mm_insert_epi32(a, b, dst);
+  }
+#else
+  template<int src> __forceinline unsigned int extract(const vuint4& b) {
+    return b[src&3];
+  }
+  template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) {
+    vuint4 updated = a;
+    updated[dst&3] = b;
+    return updated;
+  }
+#endif
+
+  /* Lane 0 is read with the plain low-element move. */
+  template<> __forceinline unsigned int extract<0>(const vuint4& b) {
+    return _mm_cvtsi128_si32(b);
+  }
+
+  /* Returns lane 0 of v as a scalar. */
+  __forceinline unsigned int toScalar(const vuint4& v) {
+    const int low = _mm_cvtsi128_si32(v);
+    return low;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* This whole region is compiled out by the "#if 0" below. It relies on vuint4
+     min/max and operator<, which are themselves commented out earlier in this
+     header, so it cannot simply be re-enabled without restoring those first. */
+#if 0
+#if defined(__SSE4_1__)
+
+  /* vector reductions: two shuffle+combine steps leave the result in every lane */
+  __forceinline vuint4 vreduce_min(const vuint4& v) { vuint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
+  __forceinline vuint4 vreduce_max(const vuint4& v) { vuint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+  __forceinline vuint4 vreduce_add(const vuint4& v) { vuint4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
+
+  /* scalar reductions: lane 0 of the vector reduction */
+  __forceinline unsigned int reduce_min(const vuint4& v) { return toScalar(vreduce_min(v)); }
+  __forceinline unsigned int reduce_max(const vuint4& v) { return toScalar(vreduce_max(v)); }
+  __forceinline unsigned int reduce_add(const vuint4& v) { return toScalar(vreduce_add(v)); }
+
+  /* index of the first lane that equals the reduced minimum / maximum */
+  __forceinline size_t select_min(const vuint4& v) { return bsf(movemask(v == vreduce_min(v))); }
+  __forceinline size_t select_max(const vuint4& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  //__forceinline size_t select_min(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  //__forceinline size_t select_max(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+#else
+
+  /* scalar fallback: reduce the four lanes directly */
+  __forceinline unsigned int reduce_min(const vuint4& v) { return min(v[0],v[1],v[2],v[3]); }
+  __forceinline unsigned int reduce_max(const vuint4& v) { return max(v[0],v[1],v[2],v[3]); }
+  __forceinline unsigned int reduce_add(const vuint4& v) { return v[0]+v[1]+v[2]+v[3]; }
+
+#endif
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Prints the four lanes as "<l0, l1, l2, l3>". */
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint4& a) {
+    cout << "<" << a[0];
+    cout << ", " << a[1];
+    cout << ", " << a[2];
+    cout << ", " << a[3];
+    return cout << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vuint8_avx.h
+++ b/thirdparty/embree/common/simd/vuint8_avx.h
@@ -0,0 +1,386 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX integer type */
+  template<>
+  struct vuint<8>
+  {
+    ALIGNED_STRUCT_(32);   
+
+    typedef vboolf8 Bool;
+    typedef vuint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };        // number of SIMD elements
+    union {                    // data
+      __m256i v;
+      struct { __m128i vl,vh; };
+      unsigned int i[8];
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint() {}
+    __forceinline vuint(const vuint8& a) { v = a.v; }
+    __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; }
+
+    __forceinline vuint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
+    __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+    __forceinline vuint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {}
+ 
+    __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
+    __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {}
+
+    __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint(ZeroTy)   : v(_mm256_setzero_si256()) {}
+    __forceinline vuint(OneTy)    : v(_mm256_set1_epi32(1)) {}
+    __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(0xFFFFFFFF)) {}
+    __forceinline vuint(StepTy)   : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
+    __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vuint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); }
+    static __forceinline vuint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); }
+
+    static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); }
+    static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); }
+
+    static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+    
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); }
+
+    static __forceinline void store_nt(void* ptr, const vuint8& v) {
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline vuint8 load(const unsigned char* ptr) {
+      vuint4 il = vuint4::load(ptr+0);
+      vuint4 ih = vuint4::load(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline vuint8 loadu(const unsigned char* ptr) {
+      vuint4 il = vuint4::loadu(ptr+0);
+      vuint4 ih = vuint4::loadu(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline vuint8 load(const unsigned short* ptr) {
+      vuint4 il = vuint4::load(ptr+0);
+      vuint4 ih = vuint4::load(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline vuint8 loadu(const unsigned short* ptr) {
+      vuint4 il = vuint4::loadu(ptr+0);
+      vuint4 ih = vuint4::loadu(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline void store(unsigned char* ptr, const vuint8& i) {
+      vuint4 il(i.vl);
+      vuint4 ih(i.vh);
+      vuint4::store(ptr + 0,il);
+      vuint4::store(ptr + 4,ih);
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vuint8& v) {
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) {
+      return vuint8(
+          *(unsigned int*)(((char*)ptr)+scale*index[0]),
+          *(unsigned int*)(((char*)ptr)+scale*index[1]),
+          *(unsigned int*)(((char*)ptr)+scale*index[2]),
+          *(unsigned int*)(((char*)ptr)+scale*index[3]),
+          *(unsigned int*)(((char*)ptr)+scale*index[4]),
+          *(unsigned int*)(((char*)ptr)+scale*index[5]),
+          *(unsigned int*)(((char*)ptr)+scale*index[6]),
+          *(unsigned int*)(((char*)ptr)+scale*index[7]));
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) {
+      vuint8 r = zero;
+      if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]);
+      if (likely(mask[4])) r[4] = *(unsigned int*)(((char*)ptr)+scale*index[4]);
+      if (likely(mask[5])) r[5] = *(unsigned int*)(((char*)ptr)+scale*index[5]);
+      if (likely(mask[6])) r[6] = *(unsigned int*)(((char*)ptr)+scale*index[6]);
+      if (likely(mask[7])) r[7] = *(unsigned int*)(((char*)ptr)+scale*index[7]);
+      return r;
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v)
+    {
+      *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v)
+    {
+      if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+
+    /*! Returns a vector in which each of the four 64-bit elements equals a. */
+    static __forceinline vuint8 broadcast64(const long long& a) {
+      return vuint8(_mm256_set1_epi64x(a));
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    /*! Read-only access to one lane; index must be below 8 (checked by assert). */
+    __forceinline const unsigned int& operator [](size_t index) const {
+      assert(index < 8);
+      return this->i[index];
+    }
+
+    /*! Mutable access to one lane; index must be below 8 (checked by assert). */
+    __forceinline unsigned int& operator [](size_t index) {
+      assert(index < 8);
+      return this->i[index];
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Reinterprets the 256 bits of a as a float-vector mask; no bits are modified. */
+  __forceinline vboolf8 asBool(const vuint8& a) {
+    const __m256i raw = a;
+    return _mm256_castsi256_ps(raw);
+  }
+
+  /*! Unary plus: returns a copy of the operand. */
+  __forceinline vuint8 operator +(const vuint8& a) {
+    return a;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Lane-wise 32-bit addition, computed separately on the low and high 128-bit halves. */
+  __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) {
+    const __m128i lo = _mm_add_epi32(a.vl, b.vl);
+    const __m128i hi = _mm_add_epi32(a.vh, b.vh);
+    return vuint8(lo, hi);
+  }
+  /*! Adds the scalar b to every lane of a. */
+  __forceinline vuint8 operator +(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a + rhs;
+  }
+  /*! Adds the scalar a to every lane of b. */
+  __forceinline vuint8 operator +(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs + b;
+  }
+
+  /*! Lane-wise 32-bit subtraction, computed separately on the low and high 128-bit halves. */
+  __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) {
+    const __m128i lo = _mm_sub_epi32(a.vl, b.vl);
+    const __m128i hi = _mm_sub_epi32(a.vh, b.vh);
+    return vuint8(lo, hi);
+  }
+  /*! Subtracts the scalar b from every lane of a. */
+  __forceinline vuint8 operator -(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a - rhs;
+  }
+  /*! Subtracts every lane of b from the broadcast scalar a. */
+  __forceinline vuint8 operator -(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs - b;
+  }
+
+  //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return vuint8(_mm_mullo_epu32(a.vl, b.vl), _mm_mullo_epu32(a.vh, b.vh)); }
+  //__forceinline vuint8 operator *(const vuint8& a, unsigned int          b) { return a * vuint8(b); }
+  //__forceinline vuint8 operator *(unsigned int          a, const vuint8& b) { return vuint8(a) * b; }
+
+  /*! Bitwise AND. The operands are reinterpreted as float vectors so the
+   *  256-bit _mm256_and_ps can be used, then the result is cast back. */
+  __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) {
+    const __m256 fa = _mm256_castsi256_ps(a);
+    const __m256 fb = _mm256_castsi256_ps(b);
+    return _mm256_castps_si256(_mm256_and_ps(fa, fb));
+  }
+  /*! Bitwise AND of every lane of a with the scalar b. */
+  __forceinline vuint8 operator &(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a & rhs;
+  }
+  /*! Bitwise AND of the scalar a with every lane of b. */
+  __forceinline vuint8 operator &(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs & b;
+  }
+
+  /*! Bitwise OR, implemented through the float-domain _mm256_or_ps. */
+  __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) {
+    const __m256 fa = _mm256_castsi256_ps(a);
+    const __m256 fb = _mm256_castsi256_ps(b);
+    return _mm256_castps_si256(_mm256_or_ps(fa, fb));
+  }
+  /*! Bitwise OR of every lane of a with the scalar b. */
+  __forceinline vuint8 operator |(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a | rhs;
+  }
+  /*! Bitwise OR of the scalar a with every lane of b. */
+  __forceinline vuint8 operator |(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs | b;
+  }
+
+  /*! Bitwise XOR, implemented through the float-domain _mm256_xor_ps. */
+  __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) {
+    const __m256 fa = _mm256_castsi256_ps(a);
+    const __m256 fb = _mm256_castsi256_ps(b);
+    return _mm256_castps_si256(_mm256_xor_ps(fa, fb));
+  }
+  /*! Bitwise XOR of every lane of a with the scalar b. */
+  __forceinline vuint8 operator ^(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a ^ rhs;
+  }
+  /*! Bitwise XOR of the scalar a with every lane of b. */
+  __forceinline vuint8 operator ^(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs ^ b;
+  }
+
+  /*! Shifts every 32-bit lane of a left by n bits; both 128-bit halves use the same count. */
+  __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) {
+    const __m128i lo = _mm_slli_epi32(a.vl, n);
+    const __m128i hi = _mm_slli_epi32(a.vh, n);
+    return vuint8(lo, hi);
+  }
+  /*! Logical right shift of every 32-bit lane of a by n bits (zeros are
+   *  shifted in). Both 128-bit halves use _mm_srli_epi32: the lanes are
+   *  unsigned, and an arithmetic shift on the low half would replicate
+   *  bit 31 and make lanes 0-3 behave differently from lanes 4-7. */
+  __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return vuint8(_mm_srli_epi32(a.vl, n), _mm_srli_epi32(a.vh, n)); }
+
+  /*! Shift left: every 32-bit lane of a is shifted left by b bits. */
+  __forceinline vuint8 sll (const vuint8& a, unsigned int b) {
+    const __m128i lo = _mm_slli_epi32(a.vl, b);
+    const __m128i hi = _mm_slli_epi32(a.vh, b);
+    return vuint8(lo, hi);
+  }
+  /*! Arithmetic shift right (_mm_srai_epi32) of every 32-bit lane by b bits. */
+  __forceinline vuint8 sra (const vuint8& a, unsigned int b) {
+    const __m128i lo = _mm_srai_epi32(a.vl, b);
+    const __m128i hi = _mm_srai_epi32(a.vh, b);
+    return vuint8(lo, hi);
+  }
+  /*! Logical shift right (_mm_srli_epi32) of every 32-bit lane by b bits. */
+  __forceinline vuint8 srl (const vuint8& a, unsigned int b) {
+    const __m128i lo = _mm_srli_epi32(a.vl, b);
+    const __m128i hi = _mm_srli_epi32(a.vh, b);
+    return vuint8(lo, hi);
+  }
+  
+  /*! Lane-wise unsigned minimum, computed per 128-bit half with _mm_min_epu32. */
+  __forceinline vuint8 min(const vuint8& a, const vuint8& b) {
+    const __m128i lo = _mm_min_epu32(a.vl, b.vl);
+    const __m128i hi = _mm_min_epu32(a.vh, b.vh);
+    return vuint8(lo, hi);
+  }
+  /*! Unsigned minimum of every lane of a and the scalar b. */
+  __forceinline vuint8 min(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return min(a, rhs);
+  }
+  /*! Unsigned minimum of the scalar a and every lane of b. */
+  __forceinline vuint8 min(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return min(lhs, b);
+  }
+
+  /*! Lane-wise unsigned maximum, computed per 128-bit half with _mm_max_epu32. */
+  __forceinline vuint8 max(const vuint8& a, const vuint8& b) {
+    const __m128i lo = _mm_max_epu32(a.vl, b.vl);
+    const __m128i hi = _mm_max_epu32(a.vh, b.vh);
+    return vuint8(lo, hi);
+  }
+  /*! Unsigned maximum of every lane of a and the scalar b. */
+  __forceinline vuint8 max(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return max(a, rhs);
+  }
+  /*! Unsigned maximum of the scalar a and every lane of b. */
+  __forceinline vuint8 max(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return max(lhs, b);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Each compound assignment evaluates the matching binary operator,
+   * stores the result in a and returns a reference to a. */
+
+  __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) {
+    a = a + b;
+    return a;
+  }
+  __forceinline vuint8& operator +=(vuint8& a, unsigned int          b) {
+    a = a + b;
+    return a;
+  }
+  
+  __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) {
+    a = a - b;
+    return a;
+  }
+  __forceinline vuint8& operator -=(vuint8& a, unsigned int          b) {
+    a = a - b;
+    return a;
+  }
+  
+  //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; }
+  //__forceinline vuint8& operator *=(vuint8& a, unsigned int          b) { return a = a * b; }
+  
+  __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) {
+    a = a & b;
+    return a;
+  }
+  __forceinline vuint8& operator &=(vuint8& a, unsigned int          b) {
+    a = a & b;
+    return a;
+  }
+  
+  __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) {
+    a = a | b;
+    return a;
+  }
+  __forceinline vuint8& operator |=(vuint8& a, unsigned int          b) {
+    a = a | b;
+    return a;
+  }
+  
+  __forceinline vuint8& operator <<=(vuint8& a, unsigned int b) {
+    a = a << b;
+    return a;
+  }
+  __forceinline vuint8& operator >>=(vuint8& a, unsigned int b) {
+    a = a >> b;
+    return a;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Lane-wise equality. Each 128-bit half is compared with
+   *  _mm_cmpeq_epi32 and the two results are combined into one mask. */
+  __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) {
+    const __m128 lo = _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl));
+    const __m128 hi = _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh));
+    return vboolf8(lo, hi);
+  }
+  /*! Compares every lane of a with the scalar b. */
+  __forceinline vboolf8 operator ==(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a == rhs;
+  }
+  /*! Compares the scalar a with every lane of b. */
+  __forceinline vboolf8 operator ==(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs == b;
+  }
+  
+  /*! Lane-wise inequality: the negation of the equality mask. */
+  __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) {
+    const vboolf8 same = (a == b);
+    return !same;
+  }
+  /*! Tests every lane of a for inequality with the scalar b. */
+  __forceinline vboolf8 operator !=(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a != rhs;
+  }
+  /*! Tests the scalar a for inequality with every lane of b. */
+  __forceinline vboolf8 operator !=(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs != b;
+  }
+  
+  //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epu32 (a.vl, b.vl)),
+  //                                                                                     _mm_castsi128_ps(_mm_cmplt_epu32 (a.vh, b.vh))); }
+  //__forceinline vboolf8 operator < (const vuint8& a, unsigned int          b) { return a <  vuint8(b); }
+  //__forceinline vboolf8 operator < (unsigned int          a, const vuint8& b) { return vuint8(a) <  b; }
+  
+  //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a <  b); }
+  //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int          b) { return a >= vuint8(b); }
+  //__forceinline vboolf8 operator >=(unsigned int          a, const vuint8& b) { return vuint8(a) >= b; }
+
+  //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epu32 (a.vl, b.vl)),
+  //                                                                                     _mm_castsi128_ps(_mm_cmpgt_epu32 (a.vh, b.vh))); }
+  //__forceinline vboolf8 operator > (const vuint8& a, unsigned int          b) { return a >  vuint8(b); }
+  //__forceinline vboolf8 operator > (unsigned int          a, const vuint8& b) { return vuint8(a) >  b; }
+
+  //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a >  b); }
+  //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int          b) { return a <= vuint8(b); }
+  //__forceinline vboolf8 operator <=(unsigned int          a, const vuint8& b) { return vuint8(a) <= b; }
+
+  /*! Named form of operator ==. */
+  __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) {
+    return a == b;
+  }
+  /*! Named form of operator !=. */
+  __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) {
+    return a != b;
+  }
+
+  /*! Equality restricted to the lanes enabled in mask. */
+  __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) {
+    const vboolf8 same = (a == b);
+    return mask & same;
+  }
+  /*! Inequality restricted to the lanes enabled in mask. */
+  __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) {
+    const vboolf8 differ = (a != b);
+    return mask & differ;
+  }
+
+  /*! Lane-wise select through _mm256_blendv_ps: f is passed as the first
+   *  blend source, t as the second, and m as the blend mask. */
+  __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) {
+    const __m256 if_false = _mm256_castsi256_ps(f);
+    const __m256 if_true  = _mm256_castsi256_ps(t);
+    return _mm256_castps_si256(_mm256_blendv_ps(if_false, if_true, m));
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Interleaves lanes of a and b with _mm256_unpacklo_ps (applied to the reinterpreted bits). */
+  __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) {
+    const __m256 fa = _mm256_castsi256_ps(a);
+    const __m256 fb = _mm256_castsi256_ps(b);
+    return _mm256_castps_si256(_mm256_unpacklo_ps(fa, fb));
+  }
+  /*! Interleaves lanes of a and b with _mm256_unpackhi_ps (applied to the reinterpreted bits). */
+  __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) {
+    const __m256 fa = _mm256_castsi256_ps(a);
+    const __m256 fb = _mm256_castsi256_ps(b);
+    return _mm256_castps_si256(_mm256_unpackhi_ps(fa, fb));
+  }
+
+  /*! Broadcasts element i of each 128-bit half to all four lanes of that half. */
+  template<int i>
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    const __m256 fv = _mm256_castsi256_ps(v);
+    return _mm256_castps_si256(_mm256_permute_ps(fv, _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  /*! Permutes the two 128-bit halves of v: i0 selects the source of the
+   *  low half of the result, i1 the source of the high half. */
+  template<int i0, int i1>
+  __forceinline vuint8 shuffle4(const vuint8& v) {
+    const __m256i src = v;
+    return _mm256_permute2f128_si256(src, src, (i1 << 4) | (i0 << 0));
+  }
+
+  /*! Two-operand form: the 128-bit halves of the result are chosen from
+   *  a and b by the selectors i0 (low half) and i1 (high half). */
+  template<int i0, int i1>
+  __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) {
+    const __m256i first  = a;
+    const __m256i second = b;
+    return _mm256_permute2f128_si256(first, second, (i1 << 4) | (i0 << 0));
+  }
+
+  /*! Permutes the four lanes inside each 128-bit half; result lane k of a half takes source lane ik. */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    const __m256 fv = _mm256_castsi256_ps(v);
+    return _mm256_castps_si256(_mm256_permute_ps(fv, _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  /*! Two-operand in-half shuffle using _mm256_shuffle_ps on the reinterpreted bits of a and b. */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) {
+    const __m256 fa = _mm256_castsi256_ps(a);
+    const __m256 fb = _mm256_castsi256_ps(b);
+    return _mm256_castps_si256(_mm256_shuffle_ps(fa, fb, _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  /* Specializations that map common patterns onto dedicated duplicate instructions. */
+  template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) {
+    const __m256 fv = _mm256_castsi256_ps(v);
+    return _mm256_castps_si256(_mm256_moveldup_ps(fv));
+  }
+  template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) {
+    const __m256 fv = _mm256_castsi256_ps(v);
+    return _mm256_castps_si256(_mm256_movehdup_ps(fv));
+  }
+  template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) {
+    const __m256d dv = _mm256_castps_pd(_mm256_castsi256_ps(v));
+    return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(dv)));
+  }
+
+  /*! Returns a copy of a whose 128-bit half number i is replaced by b. */
+  template<int i>
+  __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) {
+    return _mm256_insertf128_si256(a, b, i);
+  }
+
+  /*! Returns 128-bit half number i of a. */
+  template<int i>
+  __forceinline vuint4 extract4(const vuint8& a) {
+    return _mm256_extractf128_si256(a, i);
+  }
+
+  /*! Half 0 is obtained with a plain cast instead of an extract. */
+  template<>
+  __forceinline vuint4 extract4<0>(const vuint8& a) {
+    return _mm256_castsi256_si128(a);
+  }
+
+  /*! Returns lane 0 of v as an int. */
+  __forceinline int toScalar(const vuint8& v) {
+    const __m128i low = _mm256_castsi256_si128(v);
+    return _mm_cvtsi128_si32(low);
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+  //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+  /*! Adds each lane to its neighbour within a pair (lanes 0/1, 2/3, ...). */
+  __forceinline vuint8 vreduce_add2(const vuint8& v) {
+    const vuint8 swapped = shuffle<1,0,3,2>(v);
+    return v + swapped;
+  }
+  /*! After this step every lane holds the sum of the four lanes of its 128-bit half. */
+  __forceinline vuint8 vreduce_add4(const vuint8& v) {
+    const vuint8 pairs = vreduce_add2(v);
+    return pairs + shuffle<2,3,0,1>(pairs);
+  }
+  /*! Adds the two half sums so that every lane holds the sum of all eight lanes. */
+  __forceinline vuint8 vreduce_add (const vuint8& v) {
+    const vuint8 quads = vreduce_add4(v);
+    return quads + shuffle4<1,0>(quads);
+  }
+
+  //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); }
+  //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); }
+  /*! Sum of all eight lanes, taken from lane 0 of vreduce_add and returned as an int. */
+  __forceinline int reduce_add(const vuint8& v) {
+    const vuint8 total = vreduce_add(v);
+    return toScalar(total);
+  }
+
+  //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+  //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Prints the eight lanes as "<l0, l1, ..., l7>". */
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) {
+    cout << "<" << a[0];
+    for (int lane = 1; lane < 8; lane++)
+      cout << ", " << a[lane];
+    return cout << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/vuint8_avx2.h
+++ b/thirdparty/embree/common/simd/vuint8_avx2.h
@@ -0,0 +1,448 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX2 unsigned integer type: eight 32-bit unsigned lanes stored in one __m256i */
+  template<>
+  struct vuint<8>
+  {
+    ALIGNED_STRUCT_(32);
+        
+    typedef vboolf8 Bool;
+    typedef vuint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {             // data
+      __m256i v;          // the whole vector as one 256-bit register value
+      unsigned int i[8];  // the same storage viewed as eight scalar lanes
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    // default construction leaves the lanes uninitialized
+    __forceinline vuint() {}
+    __forceinline vuint(const vuint8& a) { v = a.v; }
+    __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; }
+
+    // implicit conversions from and to the raw intrinsic type
+    __forceinline vuint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    // one 4-wide vector is duplicated into both 128-bit halves;
+    // two 4-wide vectors become the low (a) and high (b) half
+    __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
+    __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+    __forceinline vuint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+ 
+    // unaligned load of 8 values from a; the load goes through the float intrinsic and is cast back
+    __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
+    // broadcast of a single value to all lanes
+    __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {}
+    // repeating patterns: _mm256_set_epi32 lists its arguments from lane 7 down to lane 0,
+    // so the lanes read a,b,a,b,... respectively a,b,c,d,a,b,c,d from lane 0 upwards
+    __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {}
+    // one value per lane: a is lane 0, h is lane 7
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {}
+
+    // numeric float -> integer conversion; note that _mm256_cvtps_epi32 is the signed 32-bit conversion
+    __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
+
+    // mask -> integer vector: with AVX-512VL the mask is expanded by _mm256_movm_epi32,
+    // otherwise the mask bits are reinterpreted as they are
+#if defined(__AVX512VL__)
+    __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {}
+#else
+    __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {}
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint(ZeroTy)   : v(_mm256_setzero_si256()) {}
+    __forceinline vuint(OneTy)    : v(_mm256_set1_epi32(1)) {}
+    __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {}
+    __forceinline vuint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {}
+    // lane k holds the value k
+    __forceinline vuint(StepTy)   : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
+    __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    // widening loads: 8 unsigned 8-bit or 16-bit values are read and zero-extended to 32 bits each;
+    // for the 8-bit source, load and loadu use the same 64-bit load
+    static __forceinline vuint8 load(const unsigned char* ptr)  { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+    static __forceinline vuint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+    static __forceinline vuint8 load(const unsigned short* ptr)  { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); }
+    static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+
+    // full 256-bit loads: load uses the aligned intrinsic, loadu the unaligned one
+    static __forceinline vuint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); }
+    static __forceinline vuint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); }
+
+    // full 256-bit stores: store uses the aligned intrinsic, storeu the unaligned float store on the same bits
+    static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); }
+
+#if defined(__AVX512VL__)
+
+    // compress: packs the lanes selected by mask using _mm256_mask_compress_epi32
+    static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &v) {
+      return _mm256_mask_compress_epi32(v, mask, v);
+    }
+    static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &a, const vuint8& b) {
+      return _mm256_mask_compress_epi32(a, mask, b);
+    }
+
+    // masked loads: a zero vector is passed as the source for lanes that are not loaded
+    static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); }
+    static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); }
+
+    // masked stores: only lanes selected by mask are written
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); }
+#else
+    // without AVX-512VL, load/loadu and store/storeu are identical: both use the AVX masked load/store intrinsics
+    static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+    static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+#endif
+    
+    // non-temporal (streaming) load and store
+    static __forceinline vuint8 load_nt(void* ptr) {
+      return _mm256_stream_load_si256((__m256i*)ptr);
+    }
+
+    static __forceinline void store_nt(void* ptr, const vuint8& v) {
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    // narrowing stores: each lane is converted to the destination element type,
+    // so only the low 8 respectively 16 bits of every lane are kept
+    static __forceinline void store(unsigned char* ptr, const vuint8& i)
+    {
+      for (size_t j=0; j<8; j++)
+        ptr[j] = i[j];
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vuint8& v) {
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    // gather: lane k is read from byte address ptr + scale*index[k]
+    template<int scale = 4>
+    static __forceinline vuint8 gather(const unsigned int *const ptr, const vint8& index) {
+      return _mm256_i32gather_epi32((const int*) ptr, index, scale);
+    }
+
+    // masked gather: r (all zero) is passed as the source operand for lanes that are not gathered
+    template<int scale = 4>
+    static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int *const ptr, const vint8& index) {
+      vuint8 r = zero;
+#if defined(__AVX512VL__)
+      return _mm256_mmask_i32gather_epi32(r, mask, index, (const int*) ptr, scale);
+#else
+      return _mm256_mask_i32gather_epi32(r, (const int*) ptr, index, mask, scale);
+#endif
+    }
+
+    // scatter: lane k is written to byte address ptr + scale*ofs[k];
+    // uses the scatter intrinsic with AVX-512VL and eight scalar stores otherwise
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
+#else
+      *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    // masked scatter: as above, but only lanes whose mask entry is set are written
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
+#else
+      if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    // fills the register with four copies of the 64-bit value a
+    static __forceinline vuint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    // per-lane access through the array view of the union; index must be below 8 (checked by assert)
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; }
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  /*! Converts a to a mask using _mm256_movepi32_mask. */
+  __forceinline vboolf8 asBool(const vuint8& a) {
+    const __m256i raw = a;
+    return _mm256_movepi32_mask(raw);
+  }
+#else
+  /*! Reinterprets the 256 bits of a as a float-vector mask; no bits are modified. */
+  __forceinline vboolf8 asBool(const vuint8& a) {
+    const __m256i raw = a;
+    return _mm256_castsi256_ps(raw);
+  }
+#endif
+
+  /*! Unary plus: returns a copy of the operand. */
+  __forceinline vuint8 operator +(const vuint8& a) {
+    return a;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! Lane-wise 32-bit addition. */
+  __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) {
+    return _mm256_add_epi32(a, b);
+  }
+  /*! Adds the scalar b to every lane of a. */
+  __forceinline vuint8 operator +(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a + rhs;
+  }
+  /*! Adds the scalar a to every lane of b. */
+  __forceinline vuint8 operator +(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs + b;
+  }
+
+  /*! Lane-wise 32-bit subtraction. */
+  __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) {
+    return _mm256_sub_epi32(a, b);
+  }
+  /*! Subtracts the scalar b from every lane of a. */
+  __forceinline vuint8 operator -(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a - rhs;
+  }
+  /*! Subtracts every lane of b from the broadcast scalar a. */
+  __forceinline vuint8 operator -(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs - b;
+  }
+
+  //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return _mm256_mullo_epu32(a, b); }
+  //__forceinline vuint8 operator *(const vuint8& a, unsigned int          b) { return a * vuint8(b); }
+  //__forceinline vuint8 operator *(unsigned int          a, const vuint8& b) { return vuint8(a) * b; }
+
+  /*! Bitwise AND of all 256 bits. */
+  __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) {
+    return _mm256_and_si256(a, b);
+  }
+  /*! Bitwise AND of every lane of a with the scalar b. */
+  __forceinline vuint8 operator &(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a & rhs;
+  }
+  /*! Bitwise AND of the scalar a with every lane of b. */
+  __forceinline vuint8 operator &(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs & b;
+  }
+
+  /*! Bitwise OR of all 256 bits. */
+  __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) {
+    return _mm256_or_si256(a, b);
+  }
+  /*! Bitwise OR of every lane of a with the scalar b. */
+  __forceinline vuint8 operator |(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a | rhs;
+  }
+  /*! Bitwise OR of the scalar a with every lane of b. */
+  __forceinline vuint8 operator |(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs | b;
+  }
+
+  /*! Bitwise XOR of all 256 bits. */
+  __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) {
+    return _mm256_xor_si256(a, b);
+  }
+  /*! Bitwise XOR of every lane of a with the scalar b. */
+  __forceinline vuint8 operator ^(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a ^ rhs;
+  }
+  /*! Bitwise XOR of the scalar a with every lane of b. */
+  __forceinline vuint8 operator ^(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs ^ b;
+  }
+
+  /*! Shifts every 32-bit lane left by the same count n. */
+  __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) {
+    return _mm256_slli_epi32(a, n);
+  }
+  /*! Logical right shift of every 32-bit lane by the same count n. */
+  __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) {
+    return _mm256_srli_epi32(a, n);
+  }
+
+  /*! Per-lane variable left shift: lane k of a is shifted by lane k of n. */
+  __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) {
+    return _mm256_sllv_epi32(a, n);
+  }
+  /*! Per-lane variable logical right shift: lane k of a is shifted by lane k of n. */
+  __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) {
+    return _mm256_srlv_epi32(a, n);
+  }
+
+  /*! Shift left by a uniform count. */
+  __forceinline vuint8 sll(const vuint8& a, unsigned int b) {
+    return _mm256_slli_epi32(a, b);
+  }
+  /*! Arithmetic shift right (_mm256_srai_epi32) by a uniform count. */
+  __forceinline vuint8 sra(const vuint8& a, unsigned int b) {
+    return _mm256_srai_epi32(a, b);
+  }
+  /*! Logical shift right (_mm256_srli_epi32) by a uniform count. */
+  __forceinline vuint8 srl(const vuint8& a, unsigned int b) {
+    return _mm256_srli_epi32(a, b);
+  }
+
+  /*! Shift left with a separate count per lane. */
+  __forceinline vuint8 sll(const vuint8& a, const vuint8& b) {
+    return _mm256_sllv_epi32(a, b);
+  }
+  /*! Arithmetic shift right (_mm256_srav_epi32) with a separate count per lane. */
+  __forceinline vuint8 sra(const vuint8& a, const vuint8& b) {
+    return _mm256_srav_epi32(a, b);
+  }
+  /*! Logical shift right (_mm256_srlv_epi32) with a separate count per lane. */
+  __forceinline vuint8 srl(const vuint8& a, const vuint8& b) {
+    return _mm256_srlv_epi32(a, b);
+  }
+  
+  /*! Lane-wise unsigned minimum (_mm256_min_epu32). */
+  __forceinline vuint8 min(const vuint8& a, const vuint8& b) {
+    return _mm256_min_epu32(a, b);
+  }
+  /*! Unsigned minimum of every lane of a and the scalar b. */
+  __forceinline vuint8 min(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return min(a, rhs);
+  }
+  /*! Unsigned minimum of the scalar a and every lane of b. */
+  __forceinline vuint8 min(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return min(lhs, b);
+  }
+
+  /*! Lane-wise unsigned maximum (_mm256_max_epu32). */
+  __forceinline vuint8 max(const vuint8& a, const vuint8& b) {
+    return _mm256_max_epu32(a, b);
+  }
+  /*! Unsigned maximum of every lane of a and the scalar b. */
+  __forceinline vuint8 max(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return max(a, rhs);
+  }
+  /*! Unsigned maximum of the scalar a and every lane of b. */
+  __forceinline vuint8 max(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return max(lhs, b);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Each compound assignment evaluates the matching binary operator,
+   * stores the result in a and returns a reference to a. */
+
+  __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) {
+    a = a + b;
+    return a;
+  }
+  __forceinline vuint8& operator +=(vuint8& a, unsigned int          b) {
+    a = a + b;
+    return a;
+  }
+  
+  __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) {
+    a = a - b;
+    return a;
+  }
+  __forceinline vuint8& operator -=(vuint8& a, unsigned int          b) {
+    a = a - b;
+    return a;
+  }
+  
+  //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; }
+  //__forceinline vuint8& operator *=(vuint8& a, unsigned int          b) { return a = a * b; }
+  
+  __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) {
+    a = a & b;
+    return a;
+  }
+  __forceinline vuint8& operator &=(vuint8& a, unsigned int          b) {
+    a = a & b;
+    return a;
+  }
+  
+  __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) {
+    a = a | b;
+    return a;
+  }
+  __forceinline vuint8& operator |=(vuint8& a, unsigned int          b) {
+    a = a | b;
+    return a;
+  }
+  
+  __forceinline vuint8& operator <<=(vuint8& a, const unsigned int b) {
+    a = a << b;
+    return a;
+  }
+  __forceinline vuint8& operator >>=(vuint8& a, const unsigned int b) {
+    a = a >> b;
+    return a;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  /* AVX-512VL: unsigned compares that produce a mask directly. */
+  __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) {
+    return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_EQ);
+  }
+  __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) {
+    return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_NE);
+  }
+  __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) {
+    return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LT);
+  }
+  __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) {
+    return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GE);
+  }
+  __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) {
+    return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GT);
+  }
+  __forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) {
+    return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LE);
+  }
+
+  /*! Lane-wise select through _mm256_mask_blend_epi32 with mask m; f is the first source, t the second. */
+  __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) {
+    return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t);
+  }
+#else
+  /*! Lane-wise equality; the integer compare result is reinterpreted as a float-vector mask. */
+  __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) {
+    const __m256i same = _mm256_cmpeq_epi32(a, b);
+    return _mm256_castsi256_ps(same);
+  }
+  /*! Lane-wise inequality: the negation of the equality mask. */
+  __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) {
+    const vboolf8 same = (a == b);
+    return !same;
+  }
+  //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(b, a)); }
+  //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a <  b); }
+  //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(a, b)); }
+  //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a >  b); }
+
+  /*! Lane-wise select through _mm256_blendv_ps: f is the first blend source, t the second, m the mask. */
+  __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) {
+    const __m256 if_false = _mm256_castsi256_ps(f);
+    const __m256 if_true  = _mm256_castsi256_ps(t);
+    return _mm256_castps_si256(_mm256_blendv_ps(if_false, if_true, m));
+  }
+#endif
+
+  /*! Select with a compile-time mask, forwarded as the immediate of
+   *  _mm256_blend_epi32 with f as the first source and t as the second. */
+  template<int mask>
+  __forceinline vuint8 select(const vuint8& t, const vuint8& f) {
+    const __m256i if_false = f;
+    const __m256i if_true  = t;
+    return _mm256_blend_epi32(if_false, if_true, mask);
+  }
+
+  /*! Compares every lane of a with the scalar b. */
+  __forceinline vboolf8 operator ==(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a == rhs;
+  }
+  /*! Compares the scalar a with every lane of b. */
+  __forceinline vboolf8 operator ==(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs == b;
+  }
+
+  /*! Tests every lane of a for inequality with the scalar b. */
+  __forceinline vboolf8 operator !=(const vuint8& a, unsigned int          b) {
+    const vuint8 rhs(b);
+    return a != rhs;
+  }
+  /*! Tests the scalar a for inequality with every lane of b. */
+  __forceinline vboolf8 operator !=(unsigned int          a, const vuint8& b) {
+    const vuint8 lhs(a);
+    return lhs != b;
+  }
+
+  //__forceinline vboolf8 operator < (const vuint8& a, unsigned int          b) { return a <  vuint8(b); }
+  //__forceinline vboolf8 operator < (unsigned int          a, const vuint8& b) { return vuint8(a) <  b; }
+
+  //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int          b) { return a >= vuint8(b); }
+  //__forceinline vboolf8 operator >=(unsigned int          a, const vuint8& b) { return vuint8(a) >= b; }
+
+  //__forceinline vboolf8 operator > (const vuint8& a, unsigned int          b) { return a >  vuint8(b); }
+  //__forceinline vboolf8 operator > (unsigned int          a, const vuint8& b) { return vuint8(a) >  b; }
+
+  //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int          b) { return a <= vuint8(b); }
+  //__forceinline vboolf8 operator <=(unsigned int          a, const vuint8& b) { return vuint8(a) <= b; }
+
+  /* named aliases for the comparison operators */
+  __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; }
+  __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; }
+  //__forceinline vboolf8 lt(const vuint8& a, const vuint8& b) { return a <  b; }
+  //__forceinline vboolf8 ge(const vuint8& a, const vuint8& b) { return a >= b; }
+  //__forceinline vboolf8 gt(const vuint8& a, const vuint8& b) { return a >  b; }
+  //__forceinline vboolf8 le(const vuint8& a, const vuint8& b) { return a <= b; }
+
+  /* masked comparisons: the result is only set in lanes that are also set in mask */
+#if defined(__AVX512VL__)
+  /* AVX512VL path: the mask is passed directly to the unsigned 32-bit mask-compare intrinsic */
+  __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  /* fallback path: AND the mask with the unmasked comparison; only eq/ne are provided
+     here, the ordered comparisons are disabled on this path */
+  __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); }
+  __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); }
+  //__forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <  b); }
+  //__forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >= b); }
+  //__forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >  b); }
+  //__forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <= b); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* interleave the 32-bit elements of a and b via _mm256_unpacklo_epi32 / _mm256_unpackhi_epi32 */
+  __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a, b); }
+  __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a, b); }
+
+  /*! replicates element i of each 128-bit half across that half (the integer data is
+   *  routed through the float permute and cast back unchanged) */
+  template<int i>
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  /*! rearranges the two 128-bit halves of v: i0 selects the half placed in the low
+   *  position, i1 the half placed in the high position */
+  template<int i0, int i1>
+  __forceinline vuint8 shuffle4(const vuint8& v) {
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  /*! two-source variant: i0 and i1 are passed as the low and high selector fields of
+   *  _mm256_permute2f128_si256 and pick 128-bit halves out of a and b */
+  template<int i0, int i1>
+  __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) {
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  /*! permutes the four elements inside each 128-bit half; result element k of a half
+   *  is taken from source element ik of the same half */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  /*! two-source shuffle built on _mm256_shuffle_ps with the selector (i3, i2, i1, i0) */
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) {
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  /* specializations that map common duplicate patterns onto dedicated move-duplicate intrinsics */
+  template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+  /* insert4<i> overwrites 128-bit half i of a with b; extract4<i> returns 128-bit half i of a.
+     Extracting half 0 is specialized to a plain cast of the low half. */
+  template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); }
+  template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); }
+  template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); }
+
+  /*! returns element 0 of v.
+   *  NOTE(review): the return type is int although the lanes are unsigned, so values
+   *  above INT_MAX come back negative -- confirm callers expect this. */
+  __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
+
+#if !defined(__aarch64__)
+  /*! full-width permute: the elements of v are gathered according to the per-lane
+   *  indices in index via _mm256_permutevar8x32_epi32 */
+  __forceinline vuint8 permute(const vuint8& v, const __m256i& index) {
+    return _mm256_permutevar8x32_epi32(v, index);
+  }
+
+  /*! variable shuffle: routes v through _mm256_permutevar_ps using the runtime
+   *  selectors in index and casts the result back to integer lanes */
+  __forceinline vuint8 shuffle(const vuint8& v, const __m256i& index) {
+    const __m256 as_float = _mm256_castsi256_ps(v);
+    const __m256 permuted = _mm256_permutevar_ps(as_float, index);
+    return _mm256_castps_si256(permuted);
+  }
+
+  /*! concatenates a and b and shifts the result right by i 32-bit elements.
+   *  NOTE(review): the AVX512VL intrinsic shifts across the whole 256-bit vector,
+   *  while _mm256_alignr_epi8 works independently on each 128-bit lane, so the two
+   *  branches may not produce the same result -- confirm against callers. */
+  template<int i>
+  __forceinline vuint8 align_shift_right(const vuint8& a, const vuint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_alignr_epi32(a, b, i);    
+#else
+    /* the byte-granular intrinsic needs the shift in bytes: 4 bytes per element */
+    return _mm256_alignr_epi8(a, b, 4*i);
+#endif
+  }  
+#endif // !defined(__aarch64__)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+  //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+  /*! first reduction step: adds every element to its neighbour inside each pair */
+  __forceinline vuint8 vreduce_add2(const vuint8& v) {
+    const vuint8 pair_swapped = shuffle<1,0,3,2>(v);
+    return v + pair_swapped;
+  }
+
+  /*! second reduction step: combines the pair sums inside each group of four elements */
+  __forceinline vuint8 vreduce_add4(const vuint8& v) {
+    vuint8 pair_sums = vreduce_add2(v);
+    const vuint8 quad_swapped = shuffle<2,3,0,1>(pair_sums);
+    return pair_sums + quad_swapped;
+  }
+
+  /*! final reduction step: adds the two 128-bit halves of the group-of-four sums */
+  __forceinline vuint8 vreduce_add (const vuint8& v) {
+    vuint8 quad_sums = vreduce_add4(v);
+    const vuint8 halves_swapped = shuffle4<1,0>(quad_sums);
+    return quad_sums + halves_swapped;
+  }
+
+  //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); }
+  //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); }
+  /*! horizontal sum of v returned as a scalar: element 0 of vreduce_add(v), converted through toScalar (int) */
+  __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); }
+
+  //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+  //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! prints the eight elements as "<e0, e1, ..., e7>" */
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) {
+    cout << "<" << a[0];
+    for (int lane = 1; lane < 8; lane++)
+      cout << ", " << a[lane];
+    return cout << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
--- a/thirdparty/embree/common/simd/wasm/emulation.h
+++ b/thirdparty/embree/common/simd/wasm/emulation.h
@@ -0,0 +1,13 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+// According to https://emscripten.org/docs/porting/simd.html, _MM_SET_EXCEPTION_MASK and
+// _mm_setcsr are unavailable in WebAssembly.
+
+#define _MM_SET_EXCEPTION_MASK(x)
+
+/* no-op stand-in for _mm_setcsr: accepts the control/status value and ignores it */
+__forceinline void _mm_setcsr(unsigned int)
+{
+}
--- a/thirdparty/embree/common/sys/alloc.cpp
+++ b/thirdparty/embree/common/sys/alloc.cpp
@@ -0,0 +1,377 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "alloc.h"
+#include "intrinsics.h"
+#include "sysinfo.h"
+#include "mutex.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+  
+namespace embree
+{
+  /*! Allocates size bytes aligned to align through _mm_malloc.
+   *  A zero-sized request yields nullptr; a failed allocation aborts the process.
+   *  align is expected to be a power of two (checked by assert only). */
+  void* alignedMalloc(size_t size, size_t align)
+  {
+    /* nothing to allocate */
+    if (size == 0)
+      return nullptr;
+
+    assert((align & (align-1)) == 0);
+
+    void* const mem = _mm_malloc(size,align);
+
+    /* size is known to be non-zero here, so a null result means the allocation failed */
+    if (mem == nullptr)
+      abort(); //throw std::bad_alloc();
+
+    return mem;
+  }
+
+  /*! Releases memory obtained from alignedMalloc; a null pointer is ignored. */
+  void alignedFree(void* ptr)
+  {
+    if (ptr == nullptr)
+      return;
+
+    _mm_free(ptr);
+  }
+
+#if defined(EMBREE_SYCL_SUPPORT)
+  
+  /*! Allocates size bytes of shared USM memory aligned to align.
+   *  With mode DEVICE_READ_ONLY the allocation carries the device_read_only property.
+   *  Returns nullptr for a zero-sized request and aborts when the allocation fails. */
+  void* alignedSYCLMalloc(sycl::context* context, sycl::device* device, size_t size, size_t align, EmbreeUSMMode mode)
+  {
+    assert(context);
+    assert(device);
+
+    /* nothing to allocate */
+    if (size == 0)
+      return nullptr;
+
+    assert((align & (align-1)) == 0);
+
+    const bool readOnly = (mode == EmbreeUSMMode::DEVICE_READ_ONLY);
+    void* const mem = readOnly
+      ? sycl::aligned_alloc_shared(align,size,*device,*context,sycl::ext::oneapi::property::usm::device_read_only())
+      : sycl::aligned_alloc_shared(align,size,*device,*context);
+
+    /* size is non-zero here, so null means the allocation failed */
+    if (mem == nullptr)
+      abort(); //throw std::bad_alloc();
+
+    return mem;
+  }
+
+  /*! Allocates size bytes aligned to align from the memory kind selected by type:
+   *  shared, host or device USM, or plain alignedMalloc for every other value.
+   *  mode is only consulted for shared USM. Returns nullptr for a zero-sized
+   *  request and aborts when the allocation fails. */
+  void* alignedSYCLMalloc(sycl::context* context, sycl::device* device, size_t size, size_t align, EmbreeUSMMode mode, EmbreeMemoryType type)
+  {
+    assert(context);
+    assert(device);
+
+    /* nothing to allocate */
+    if (size == 0)
+      return nullptr;
+
+    assert((align & (align-1)) == 0);
+
+    void* mem = nullptr;
+    switch (type)
+    {
+    case EmbreeMemoryType::USM_SHARED:
+      mem = (mode == EmbreeUSMMode::DEVICE_READ_ONLY)
+        ? sycl::aligned_alloc_shared(align,size,*device,*context,sycl::ext::oneapi::property::usm::device_read_only())
+        : sycl::aligned_alloc_shared(align,size,*device,*context);
+      break;
+
+    case EmbreeMemoryType::USM_HOST:
+      mem = sycl::aligned_alloc_host(align,size,*context);
+      break;
+
+    case EmbreeMemoryType::USM_DEVICE:
+      mem = sycl::aligned_alloc_device(align,size,*device,*context);
+      break;
+
+    default: /* MALLOC and anything else: regular host allocation */
+      mem = alignedMalloc(size,align);
+      break;
+    }
+
+    /* size is non-zero here, so null means the allocation failed */
+    if (mem == nullptr)
+      abort(); //throw std::bad_alloc();
+
+    return mem;
+  }
+  
+  /*! Frees memory returned by alignedSYCLMalloc. The pointer kind is queried from the
+   *  context: host, device and shared USM pointers go to sycl::free, everything else
+   *  to alignedFree. A null pointer is ignored. */
+  void alignedSYCLFree(sycl::context* context, void* ptr)
+  {
+    assert(context);
+    if (ptr == nullptr)
+      return;
+
+    const sycl::usm::alloc kind = sycl::get_pointer_type(ptr, *context);
+    const bool isUSM = kind == sycl::usm::alloc::host
+                    || kind == sycl::usm::alloc::device
+                    || kind == sycl::usm::alloc::shared;
+
+    if (isUSM)
+      sycl::free(ptr,*context);
+    else
+      alignedFree(ptr);
+  }
+
+#endif
+
+  /* set by os_init: whether huge page allocations may be attempted */
+  static bool huge_pages_enabled = false;
+  /* serializes os_init */
+  static MutexSys os_init_mutex;
+
+  /*! Decides whether an allocation of the given size should try huge pages:
+   *  only when huge pages are enabled and rounding up to a 2MB multiple wastes
+   *  less than 1/66 of the request. */
+  __forceinline bool isHugePageCandidate(const size_t bytes)
+  {
+    if (huge_pages_enabled)
+    {
+      /* size after rounding up to the next 2MB boundary */
+      const size_t roundedBytes = (bytes+PAGE_SIZE_2M-1) & ~size_t(PAGE_SIZE_2M-1);
+      const size_t wastedBytes = roundedBytes-bytes;
+      return 66*wastedBytes < bytes; // at most 1.5% overhead
+    }
+
+    return false;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef _WIN32
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <malloc.h>
+
+namespace embree
+{
+  /*! Tries to enable SeLockMemoryPrivilege on the token of the current process.
+   *  \param verbose print a warning to std::cout describing the failing step
+   *  \return true when the privilege got enabled, false on any failure
+   *
+   *  The token handle opened here is closed again on every path; previously it
+   *  was leaked. */
+  bool win_enable_selockmemoryprivilege (bool verbose)
+  {
+    HANDLE hToken;
+    if (!OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY | TOKEN_ADJUST_PRIVILEGES, &hToken)) {
+      if (verbose) std::cout << "WARNING: OpenProcessToken failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl;
+      return false;
+    }
+
+    TOKEN_PRIVILEGES tp;
+    tp.PrivilegeCount = 1;
+    tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+
+    if (!LookupPrivilegeValueW(nullptr, L"SeLockMemoryPrivilege", &tp.Privileges[0].Luid)) {
+      /* read the error code before CloseHandle gets a chance to overwrite it */
+      const DWORD lookupError = GetLastError();
+      CloseHandle(hToken);
+      if (verbose) std::cout << "WARNING: LookupPrivilegeValue failed while trying to enable SeLockMemoryPrivilege: " << lookupError << std::endl;
+      return false;
+    }
+    
+    /* AdjustTokenPrivileges can succeed without assigning the privilege, which is
+       only visible through the last error value, so reset it first */
+    SetLastError(ERROR_SUCCESS);
+    const BOOL adjusted = AdjustTokenPrivileges(hToken, FALSE, &tp, sizeof(tp), nullptr, 0);
+    const DWORD adjustError = GetLastError();
+
+    /* the adjustment applies to the token itself, the handle is no longer needed */
+    CloseHandle(hToken);
+
+    if (!adjusted) {
+      if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed while trying to enable SeLockMemoryPrivilege" << std::endl;
+      return false;
+    }
+    
+    if (adjustError == ERROR_NOT_ALL_ASSIGNED) {
+      if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed to enable SeLockMemoryPrivilege: Add SeLockMemoryPrivilege for current user and run process in elevated mode (Run as administrator)." << std::endl;
+      return false;
+    } 
+
+    return true;
+  }
+
+  /*! Configures huge page usage (Windows).
+   *  \param hugepages request huge page support
+   *  \param verbose   not used by this implementation
+   *  \return true when the requested configuration is active; false when huge pages
+   *          were requested but the large page minimum is not 2MB */
+  bool os_init(bool hugepages, bool verbose) 
+  {
+    Lock<MutexSys> lock(os_init_mutex);
+
+    /* huge pages not wanted: switch them off and report success */
+    if (!hugepages) {
+      huge_pages_enabled = false;
+      return true;
+    }
+
+    /* only a 2MB large page size is supported */
+    const bool supported = (GetLargePageMinimum() == PAGE_SIZE_2M);
+    huge_pages_enabled = supported;
+    return supported;
+  }
+
+  /*! Allocates bytes of committed read/write memory through VirtualAlloc (Windows).
+   *  \param bytes     size of the request; 0 yields nullptr
+   *  \param hugepages out: true when the memory is backed by large pages
+   *  \return the allocation; aborts when even the regular allocation fails */
+  void* os_malloc(size_t bytes, bool& hugepages)
+  {
+    if (bytes == 0) {
+      hugepages = false;
+      return nullptr;
+    }
+
+    /* large pages first, when the request qualifies */
+    if (isHugePageCandidate(bytes)) 
+    {
+      void* const largeMem = VirtualAlloc(nullptr,bytes,MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES,PAGE_READWRITE);
+      if (largeMem != nullptr) {
+        hugepages = true;
+        return largeMem;
+      }
+    } 
+
+    /* otherwise (or when that failed) use regular 4k pages */
+    void* const regularMem = VirtualAlloc(nullptr,bytes,MEM_COMMIT | MEM_RESERVE,PAGE_READWRITE);
+    if (regularMem == nullptr) abort(); //throw std::bad_alloc();
+    hugepages = false;
+    return regularMem;
+  }
+
+  /*! Shrinks an os_malloc allocation from bytesOld to bytesNew bytes (Windows).
+   *  Both sizes are rounded up to 4k pages and the pages past the new end are
+   *  decommitted. Huge page allocations are never shrunk.
+   *  \return the size of the allocation after the call (page rounded, or bytesOld
+   *          unchanged for huge pages) */
+  size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) 
+  {
+    if (hugepages) // decommitting huge pages seems not to work under Windows
+      return bytesOld;
+
+    /* huge pages returned above, so only the 4k page size can apply here */
+    const size_t pageSize = PAGE_SIZE_4K;
+    bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1);
+    bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1);
+    if (bytesNew >= bytesOld)
+      return bytesOld;
+
+    if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
+      abort(); //throw std::bad_alloc();
+
+    return bytesNew;
+  }
+
+  /*! Releases an os_malloc allocation (Windows). A zero size is a no-op; the
+   *  hugepages flag is not needed here. Aborts when VirtualFree fails. */
+  void os_free(void* ptr, size_t bytes, bool hugepages) 
+  {
+    if (bytes == 0) 
+      return;
+
+    /* MEM_RELEASE frees the whole region, hence the size argument of 0 */
+    const BOOL released = VirtualFree(ptr,0,MEM_RELEASE);
+    if (!released)
+      abort(); //throw std::bad_alloc();
+  }
+
+  /*! huge page hint; intentionally empty in the Windows implementation */
+  void os_advise(void *ptr, size_t bytes)
+  {
+  }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sstream>
+
+#if defined(__MACOSX__)
+#include <mach/vm_statistics.h>
+#endif
+
+namespace embree
+{
+  /*! Configures huge page usage (Unix).
+   *  \param hugepages request huge page support
+   *  \param verbose   print a warning to std::cout when huge pages cannot be enabled
+   *  \return true when the requested configuration is active; on Linux false when
+   *          /proc/meminfo cannot be read or does not report a 2MB huge page size */
+  bool os_init(bool hugepages, bool verbose) 
+  {
+    Lock<MutexSys> lock(os_init_mutex);
+
+    if (!hugepages) {
+      huge_pages_enabled = false;
+      return true;
+    }
+
+#if defined(__LINUX__)
+
+    /* huge page size in bytes; wide unsigned type so that very large page sizes
+       reported in kB cannot overflow when converted */
+    unsigned long long hugepagesize = 0;
+
+    std::ifstream file; 
+    file.open("/proc/meminfo",std::ios::in);
+    if (!file.is_open()) {
+      if (verbose) std::cout << "WARNING: Could not open /proc/meminfo. Huge page support cannot get enabled!" << std::endl;
+      huge_pages_enabled = false;
+      return false;
+    }
+    
+    /* every line is split into "<tag> <value> <unit>", separated by runs of blanks */
+    std::string line;
+    while (getline(file,line))
+    {
+      std::stringstream sline(line);
+      while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+      std::string tag; getline(sline,tag,' ');
+      while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+      std::string val; getline(sline,val,' ');
+      while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+      std::string unit; getline(sline,unit,' ');
+      if (tag == "Hugepagesize:" && unit == "kB") {
+        /* strtoull yields 0 for a malformed value instead of throwing like std::stoi,
+           which then simply fails the 2MB check below */
+        hugepagesize = strtoull(val.c_str(),nullptr,10)*1024ull;
+        break;
+      }
+    }
+    
+    if (hugepagesize != (unsigned long long)(PAGE_SIZE_2M)) 
+    {
+      if (verbose) std::cout << "WARNING: Only 2MB huge pages supported. Huge page support cannot get enabled!" << std::endl;
+      huge_pages_enabled = false;
+      return false;
+    }
+#endif
+
+    huge_pages_enabled = true;
+    return true;
+  }
+
+  /*! Allocates bytes of anonymous private read/write memory through mmap (Unix).
+   *  \param bytes     size of the request; 0 yields nullptr
+   *  \param hugepages out: true when the explicit huge page mapping succeeded
+   *  \return the mapping; aborts when even the regular mapping fails */
+  void* os_malloc(size_t bytes, bool& hugepages)
+  { 
+    if (bytes == 0) {
+      hugepages = false;
+      return nullptr;
+    }
+
+    const int protection = PROT_READ | PROT_WRITE;
+    const int mapFlags = MAP_PRIVATE | MAP_ANON;
+
+    /* explicit huge page mapping first, when the request qualifies */
+    if (isHugePageCandidate(bytes)) 
+    {
+#if defined(__MACOSX__)
+      void* const hugeMem = mmap(0, bytes, protection, mapFlags, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
+      if (hugeMem != MAP_FAILED) {
+        hugepages = true;
+        return hugeMem;
+      }
+#elif defined(MAP_HUGETLB)
+      void* const hugeMem = mmap(0, bytes, protection, mapFlags | MAP_HUGETLB, -1, 0);
+      if (hugeMem != MAP_FAILED) {
+        hugepages = true;
+        return hugeMem;
+      }
+#endif
+    } 
+
+    /* regular 4k pages otherwise, or when the huge page mapping failed */
+    void* const mem = mmap(0, bytes, protection, mapFlags, -1, 0);
+    if (mem == MAP_FAILED) abort(); //throw std::bad_alloc();
+    hugepages = false;
+
+    /* let transparent huge pages (THP) back the mapping where available */
+    os_advise(mem,bytes);
+    return mem;
+  }
+
+  /*! Shrinks an os_malloc allocation from bytesOld to bytesNew bytes (Unix).
+   *  Both sizes are rounded up to the page size in use (2MB or 4k) and the pages
+   *  past the new end are unmapped.
+   *  \return the page rounded size of the allocation after the call */
+  size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) 
+  {
+    const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
+    const size_t pageMask = pageSize-1;
+
+    const size_t newEnd = (bytesNew+pageMask) & ~pageMask;
+    const size_t oldEnd = (bytesOld+pageMask) & ~pageMask;
+
+    /* nothing to give back */
+    if (newEnd >= oldEnd)
+      return oldEnd;
+
+    char* const tail = (char*)ptr+newEnd;
+    if (munmap(tail,oldEnd-newEnd) == -1)
+      abort(); //throw std::bad_alloc();
+
+    return newEnd;
+  }
+
+  /*! Unmaps an os_malloc allocation (Unix). A zero size is a no-op; aborts when
+   *  munmap fails. */
+  void os_free(void* ptr, size_t bytes, bool hugepages) 
+  {
+    if (bytes == 0)
+      return;
+
+    /* the length handed to munmap is rounded up to the page size in use, which
+       matters for huge page mappings */
+    const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
+    const size_t pageMask = pageSize-1;
+    const size_t mappedBytes = (bytes+pageMask) & ~pageMask;
+
+    if (munmap(ptr,mappedBytes) == -1)
+      abort(); //throw std::bad_alloc();
+  }
+
+  /* hint for transparent huge pages (THP): advises MADV_HUGEPAGE for the given range
+     where the platform defines it, otherwise does nothing; the madvise result is ignored */
+  void os_advise(void* pptr, size_t bytes)
+  {
+#if defined(MADV_HUGEPAGE)
+    madvise(pptr,bytes,MADV_HUGEPAGE); 
+#endif
+  }
+}
+
+#endif
--- a/thirdparty/embree/common/sys/alloc.h
+++ b/thirdparty/embree/common/sys/alloc.h
@@ -0,0 +1,215 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include <vector>
+#include <set>
+
+namespace embree
+{
+  /* injects class-level operator new/delete (scalar and array forms) that allocate
+     through alignedMalloc with the given alignment and release through alignedFree */
+#define ALIGNED_STRUCT_(align)                                            \
+  void* operator new(size_t size) { return alignedMalloc(size,align); }   \
+  void operator delete(void* ptr) { alignedFree(ptr); }                   \
+  void* operator new[](size_t size) { return alignedMalloc(size,align); } \
+  void operator delete[](void* ptr) { alignedFree(ptr); }
+  
+  /* same as ALIGNED_STRUCT_, for use inside a class: the operators are emitted under
+     public: and the access level is switched to private: afterwards */
+#define ALIGNED_CLASS_(align)                                          \
+ public:                                                               \
+    ALIGNED_STRUCT_(align)                                             \
+ private:
+
+  /*! aligned allocation */
+  void* alignedMalloc(size_t size, size_t align);
+  void alignedFree(void* ptr);
+
+
+  /*! access mode requested for a USM allocation; DEFAULT and DEVICE_READ_WRITE
+   *  share the value 0 */
+  enum class EmbreeUSMMode {
+    DEFAULT = 0,
+    DEVICE_READ_WRITE = 0,
+    DEVICE_READ_ONLY = 1
+  };
+
+  /*! kind of memory an allocation is taken from: host, device or shared USM, or
+   *  regular malloc'ed host memory */
+  enum class EmbreeMemoryType {
+    USM_HOST = 0,
+    USM_DEVICE = 1,
+    USM_SHARED = 2,
+    MALLOC = 3
+  };
+
+#if defined(EMBREE_SYCL_SUPPORT)
+
+  /*! aligned allocation using SYCL USM */
+  void* alignedSYCLMalloc(sycl::context* context, sycl::device* device, size_t size, size_t align, EmbreeUSMMode mode);
+  void* alignedSYCLMalloc(sycl::context* context, sycl::device* device, size_t size, size_t align, EmbreeUSMMode mode, EmbreeMemoryType type);
+  void alignedSYCLFree(sycl::context* context, void* ptr);
+
+#endif
+  
+  /*! STL-style allocator whose storage comes from alignedMalloc with a fixed
+   *  compile-time alignment and goes back through alignedFree */
+  template<typename T, size_t alignment>
+    struct aligned_allocator
+    {
+      using value_type      = T;
+      using pointer         = T*;
+      using const_pointer   = const T*;
+      using reference       = T&;
+      using const_reference = const T&;
+      using size_type       = std::size_t;
+      using difference_type = std::ptrdiff_t;
+
+      /*! storage for n objects of type T (uninitialized) */
+      __forceinline pointer allocate( size_type n ) {
+        void* const raw = alignedMalloc(n*sizeof(value_type),alignment);
+        return (pointer) raw;
+      }
+
+      /*! gives storage back; the element count is not needed */
+      __forceinline void deallocate( pointer p, size_type n ) {
+        alignedFree(p);
+      }
+
+      /*! copy-constructs val into the storage at p */
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val);
+      }
+
+      /*! runs the destructor of the object at p without releasing storage */
+      __forceinline void destroy( pointer p ) {
+        p->~T();
+      }
+    };
+
+  /*! allocates pages directly from OS */
+  bool win_enable_selockmemoryprivilege(bool verbose);
+  bool os_init(bool hugepages, bool verbose);
+  void* os_malloc (size_t bytes, bool& hugepages);
+  size_t os_shrink (void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages);
+  void  os_free   (void* ptr, size_t bytes, bool hugepages);
+  void  os_advise (void* ptr, size_t bytes);
+
+  /*! STL-style allocator whose storage comes directly from os_malloc / os_free;
+   *  it remembers whether the most recent allocation was backed by huge pages */
+  template<typename T>
+    struct os_allocator
+    {
+      using value_type      = T;
+      using pointer         = T*;
+      using const_pointer   = const T*;
+      using reference       = T&;
+      using const_reference = const T&;
+      using size_type       = std::size_t;
+      using difference_type = std::ptrdiff_t;
+
+      __forceinline os_allocator () 
+        : hugepages(false) {}
+
+      /*! storage for n objects of type T; os_malloc updates the hugepages flag */
+      __forceinline pointer allocate( size_type n ) {
+        void* const raw = os_malloc(n*sizeof(value_type),hugepages);
+        return (pointer) raw;
+      }
+
+      /*! gives storage for n objects back, passing the stored hugepages flag along */
+      __forceinline void deallocate( pointer p, size_type n ) {
+        os_free(p,n*sizeof(value_type),hugepages);
+      }
+
+      /*! copy-constructs val into the storage at p */
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val);
+      }
+
+      /*! runs the destructor of the object at p without releasing storage */
+      __forceinline void destroy( pointer p ) {
+        p->~T();
+      }
+
+      bool hugepages;  //!< written by os_malloc in allocate, read by deallocate
+    };
+
+  /*! allocator that never performs allocations: allocate() aborts the process and
+   *  deallocate() does nothing; construct/destroy still work on storage supplied
+   *  by the caller */
+  template<typename T>
+    struct no_allocator
+    {
+      typedef T value_type;
+      typedef T* pointer;
+      typedef const T* const_pointer;
+      typedef T& reference;
+      typedef const T& const_reference;
+      typedef std::size_t size_type;
+      typedef std::ptrdiff_t difference_type;
+
+      /* never returns: any allocation attempt terminates the process */
+      __forceinline pointer allocate( size_type n ) {
+        abort(); //throw std::runtime_error("no allocation supported");
+      }
+
+      /* nothing was allocated, so there is nothing to release */
+      __forceinline void deallocate( pointer p, size_type n ) {
+      }
+
+      /* copy-constructs val into the storage at p */
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val);
+      }
+
+      /* runs the destructor of the object at p */
+      __forceinline void destroy( pointer p ) {
+        p->~T();
+      }
+    };
+
+  /*! Pool of integer IDs. Fresh IDs are handed out in increasing order starting
+   *  at 0; IDs that were given back are kept in an ordered set and reused first,
+   *  smallest one first. */
+  template<typename T, size_t max_id>
+    struct IDPool
+    {
+      typedef T value_type;
+
+      IDPool ()
+      : nextID(0) {}
+
+      /*! returns an unused ID, or -1 (converted to T) when no fresh ID is left */
+      T allocate() 
+      {
+        /* reuse the smallest returned ID if there is one */
+        if (!IDs.empty()) 
+        {
+          auto first = IDs.begin();
+          T id = *first;
+          IDs.erase(first);
+          return id;
+        } 
+
+        /* no fresh ID left */
+        if (size_t(nextID)+1 > max_id)
+          return -1;
+
+        /* hand out a fresh ID */
+        return nextID++;
+      }
+
+      /*! marks an ID chosen by the user as used; returns false when the ID exceeds
+       *  max_id or is already in use */
+      bool add(T id)
+      {
+        if (id > max_id)
+          return false;
+
+        /* at or past the fresh range: every skipped ID becomes available for reuse */
+        if (!(id < nextID))
+        {
+          for (T skipped=nextID; skipped<id; skipped++) {
+            IDs.insert(skipped);
+          }
+          nextID = id+1;
+          return true;
+        }
+
+        /* below the fresh range: the ID is only free if it sits in the reuse set */
+        auto entry = IDs.find(id);
+        if (entry == IDs.end()) return false;
+        IDs.erase(entry);
+        return true;
+      }
+
+      /*! gives an ID back to the pool for later reuse */
+      void deallocate( T id ) 
+      {
+        assert(id < nextID);
+        MAYBE_UNUSED auto done = IDs.insert(id).second;
+        assert(done);
+      }
+
+    private:
+      std::set<T> IDs;   //!< stores deallocated IDs to be reused
+      T nextID;          //!< next ID to use when the IDs set is empty
+    };
+}
+
--- a/thirdparty/embree/common/sys/array.h
+++ b/thirdparty/embree/common/sys/array.h
@@ -0,0 +1,222 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "alloc.h"
+
+namespace embree
+{
+  /*! static array with static size: a thin wrapper around T items[N] with a
+   *  container-like interface; element access is bounds checked by assert only */
+  template<typename T, size_t N>
+    class array_t
+    {
+    public:
+
+      /********************** Iterators  ****************************/
+
+      /* const and non-const overloads: a const-only member returning a mutable T*
+         to the (then const) items does not compile once instantiated */
+      __forceinline       T* begin()       { return items; };
+      __forceinline const T* begin() const { return items; };
+      __forceinline       T* end  ()       { return items+N; };
+      __forceinline const T* end  () const { return items+N; };
+
+
+      /********************** Capacity ****************************/
+
+      __forceinline bool   empty    () const { return N == 0; }
+      __forceinline size_t size     () const { return N; }
+      __forceinline size_t max_size () const { return N; }
+            
+
+      /******************** Element access **************************/
+
+      __forceinline       T& operator[](size_t i)       { assert(i < N); return items[i]; }
+      __forceinline const T& operator[](size_t i) const { assert(i < N); return items[i]; }
+
+      __forceinline       T& at(size_t i)       { assert(i < N); return items[i]; }
+      __forceinline const T& at(size_t i) const { assert(i < N); return items[i]; }
+
+      /* first and last element, again as const / non-const pairs */
+      __forceinline       T& front()       { assert(N > 0); return items[0]; };
+      __forceinline const T& front() const { assert(N > 0); return items[0]; };
+      __forceinline       T& back ()       { assert(N > 0); return items[N-1]; };
+      __forceinline const T& back () const { assert(N > 0); return items[N-1]; };
+
+      __forceinline       T* data()       { return items; };
+      __forceinline const T* data() const { return items; };
+
+    private:
+      T items[N];
+    };
+
+  /*! static array with dynamic size: storage for N elements is embedded in the
+   *  object, M tracks how many of them are currently in use; all range checks
+   *  are asserts only */
+  template<typename T, size_t N>
+    class darray_t
+    {
+    public:
+
+      __forceinline darray_t () : M(0) {}
+
+      /*! fills all N storage slots with v; the size still starts at 0 */
+      __forceinline darray_t (const T& v) : M(0) {
+        for (size_t i=0; i<N; i++) items[i] = v;
+      }
+
+      /********************** Iterators  ****************************/
+
+      /* the casts drop the const added by the const member function */
+      __forceinline T* begin() const { return (T*)items; };
+      __forceinline T* end  () const { return (T*)items+M; };
+
+
+      /********************** Capacity ****************************/
+
+      __forceinline bool   empty    () const { return M == 0; }
+      __forceinline size_t size     () const { return M; }
+      __forceinline size_t capacity () const { return N; }
+      __forceinline size_t max_size () const { return N; }
+      
+      /*! sets the number of used elements; elements are neither constructed nor destroyed */
+      void resize(size_t new_size) {
+        /* using the full capacity N is valid, only exceeding it is not */
+        assert(new_size <= max_size());
+        M = new_size;
+      }
+
+      /******************** Modifiers **************************/
+
+      __forceinline void push_back(const T& v) 
+      {
+        /* there must be a free slot: the last valid write goes to items[N-1] */
+        assert(M < max_size());
+        items[M++] = v;
+      }
+
+      __forceinline void pop_back() 
+      {
+        assert(!empty());
+        M--;
+      }
+
+      __forceinline void clear() {
+        M = 0;
+      }
+
+      /******************** Element access **************************/
+
+      __forceinline       T& operator[](size_t i)       { assert(i < M); return items[i]; }
+      __forceinline const T& operator[](size_t i) const { assert(i < M); return items[i]; }
+
+      __forceinline       T& at(size_t i)       { assert(i < M); return items[i]; }
+      __forceinline const T& at(size_t i) const { assert(i < M); return items[i]; }
+
+      __forceinline T& front() { assert(M > 0); return items[0]; };
+      __forceinline T& back () { assert(M > 0); return items[M-1]; };
+
+      __forceinline       T* data()       { return items; };
+      __forceinline const T* data() const { return items; };
+
+    private:
+      size_t M;      //!< number of elements in use
+      T items[N];    //!< embedded storage
+    };
+
+  /*! dynamic sized array that is allocated on the stack as long as N elements fit
+   *  into max_stack_bytes; larger arrays are taken from alignedMalloc (64 byte
+   *  aligned) and released in the destructor. Elements in the heap case are raw
+   *  storage: no constructors or destructors are run on them here. */
+#define dynamic_large_stack_array(Ty,Name,N,max_stack_bytes) StackArray<Ty,max_stack_bytes> Name(N)
+  template<typename Ty, size_t max_stack_bytes>
+    struct __aligned(64) StackArray
+  {
+    __forceinline StackArray (const size_t N)
+      : N(N)
+    {
+      /* small enough: use the embedded buffer, otherwise fall back to the heap */
+      if (N*sizeof(Ty) <= max_stack_bytes) 
+        data = &arr[0];
+      else
+        data = (Ty*) alignedMalloc(N*sizeof(Ty),64); 
+    }
+
+    __forceinline ~StackArray () {
+      /* only heap storage has to be released */
+      if (data != &arr[0]) alignedFree(data);
+    }
+
+    __forceinline operator       Ty* ()       { return data; }
+    __forceinline operator const Ty* () const { return data; }
+
+    /* element access, bounds checked by assert only; one overload per index type */
+    __forceinline       Ty& operator[](const int i)       { assert(i>=0 && i<N); return data[i]; }
+    __forceinline const Ty& operator[](const int i) const { assert(i>=0 && i<N); return data[i]; }
+
+    __forceinline       Ty& operator[](const unsigned i)       { assert(i<N); return data[i]; }
+    __forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; }
+
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
+    __forceinline       Ty& operator[](const size_t i)       { assert(i<N); return data[i]; }
+    __forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; }
+#endif
+
+  private:
+    Ty arr[max_stack_bytes/sizeof(Ty)];  // embedded buffer
+    Ty* data;                            // points at arr or at the heap allocation
+    size_t N;                            // number of elements requested
+
+  private:
+    StackArray (const StackArray& other) DELETED; // do not implement
+    StackArray& operator= (const StackArray& other) DELETED; // do not implement
+
+  };
+
+  /*! dynamic sized array that is allocated on the stack: it starts with an embedded
+   *  buffer of max_stack_elements and moves to a heap array of max_total_elements
+   *  the first time an index beyond the embedded buffer is accessed */
+  template<typename Ty, size_t max_stack_elements, size_t max_total_elements>
+    struct __aligned(64) DynamicStackArray
+  {
+    __forceinline DynamicStackArray ()
+      : data(&arr[0]) {}
+
+    __forceinline ~DynamicStackArray ()
+    {
+      /* heap storage comes from new[] in resize() */
+      if (!isStackAllocated())
+        delete[] data;
+    }
+
+    __forceinline bool isStackAllocated() const {
+      return data == &arr[0];
+    }
+
+    /* capacity of the storage currently in use, not a count of written elements */
+    __forceinline size_t size() const
+    {
+      if (isStackAllocated()) return max_stack_elements;
+      else return max_total_elements;
+    }
+
+    /* makes sure M elements fit; a no-op unless M exceeds the embedded buffer and
+       the array is still using it */
+    __forceinline void resize(size_t M)
+    {
+      assert(M <= max_total_elements);
+      if (likely(M <= max_stack_elements)) return;
+      if (likely(!isStackAllocated())) return;
+
+      data = new Ty[max_total_elements];
+      
+      /* carry the embedded elements over to the heap array */
+      for (size_t i=0; i<max_stack_elements; i++)
+        data[i] = arr[i];
+    }
+
+    __forceinline operator       Ty* ()       { return data; }
+    __forceinline operator const Ty* () const { return data; }
+
+    /* element access grows the storage on demand through resize(i+1) */
+    __forceinline       Ty& operator[](const int i)      { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; }
+    __forceinline       Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
+
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
+    __forceinline       Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
+#endif
+
+    /* copies other.size() elements, i.e. the whole capacity other currently uses */
+    __forceinline DynamicStackArray (const DynamicStackArray& other)
+      : data(&arr[0]) 
+    {
+      for (size_t i=0; i<other.size(); i++)
+        this->operator[] (i) = other[i];
+    }
+     
+    /* element-wise copy of other.size() elements; existing heap storage is kept */
+    DynamicStackArray& operator= (const DynamicStackArray& other)
+    {
+      for (size_t i=0; i<other.size(); i++)
+        this->operator[] (i) = other[i];
+
+      return *this;
+    }
+
+  private:
+    Ty arr[max_stack_elements];  // embedded buffer
+    Ty* data;                    // points at arr or at the heap array
+  };
+}
--- a/thirdparty/embree/common/sys/atomic.h
+++ b/thirdparty/embree/common/sys/atomic.h
@@ -0,0 +1,59 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <atomic>
+#include "intrinsics.h"
+
+namespace embree
+{
+/* compiler memory barriers
+ *
+ * __memory_barrier() is defined per compiler below. The GCC/Clang form is an
+ * empty asm statement with a "memory" clobber and the MSVC form is
+ * _ReadWriteBarrier(). No definition is provided for the Intel compiler
+ * (the define is commented out) -- presumably that compiler supplies
+ * __memory_barrier() itself; TODO confirm. */
+#if defined(__INTEL_COMPILER)
+//#define __memory_barrier() __memory_barrier()
+#elif defined(__GNUC__) || defined(__clang__)
+#  define __memory_barrier() asm volatile("" ::: "memory")
+#elif  defined(_MSC_VER)
+#  define __memory_barrier() _ReadWriteBarrier()
+#endif
+
+  /*! std::atomic<T> extended with copy construction and copy assignment.
+   *  Both are implemented as a load from the source followed by a store
+   *  into this object, so the copy as a whole is not a single atomic step. */
+  template <typename T>
+    struct atomic : public std::atomic<T>
+  {
+    /*! the base std::atomic<T> is default constructed */
+    atomic () {}
+
+    /*! initializes the atomic with the value a */
+    atomic (const T& a)
+      : std::atomic<T>(a) {}
+
+    /*! takes over the value currently stored in a */
+    atomic (const atomic<T>& a) {
+      const T snapshot = a.load();
+      this->store(snapshot);
+    }
+
+    /*! stores the value currently held by other into this atomic */
+    atomic& operator=(const atomic<T>& other) {
+      const T snapshot = other.load();
+      this->store(snapshot);
+      return *this;
+    }
+  };
+
+  /*! Atomically lowers the value stored in aref to bref if bref is smaller.
+   *
+   *  \param aref atomic that receives the minimum
+   *  \param bref plain value to compare against
+   *
+   *  The loop retries whenever another thread changed aref between the load
+   *  and the compare-exchange. */
+  template<typename T>
+    __forceinline void _atomic_min(std::atomic<T>& aref, const T& bref)
+  {
+    /* bref is a plain T, not an atomic, so it is read directly; calling
+       .load() on it did not compile for arithmetic types */
+    const T b = bref;
+    while (true) {
+      T a = aref.load();
+      if (a <= b) break;                            // already small enough
+      if (aref.compare_exchange_strong(a,b)) break; // installed b
+    }
+  }
+
+  /*! Atomically raises the value stored in aref to bref if bref is larger.
+   *
+   *  \param aref atomic that receives the maximum
+   *  \param bref plain value to compare against
+   *
+   *  The loop retries whenever another thread changed aref between the load
+   *  and the compare-exchange. */
+  template<typename T>
+    __forceinline void _atomic_max(std::atomic<T>& aref, const T& bref)
+  {
+    /* bref is a plain T, not an atomic, so it is read directly; calling
+       .load() on it did not compile for arithmetic types */
+    const T b = bref;
+    while (true) {
+      T a = aref.load();
+      if (a >= b) break;                            // already large enough
+      if (aref.compare_exchange_strong(a,b)) break; // installed b
+    }
+  }
+}
--- a/thirdparty/embree/common/sys/barrier.cpp
+++ b/thirdparty/embree/common/sys/barrier.cpp
@@ -0,0 +1,289 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "barrier.h"
+#include "condition.h"
+#include "regression.h"
+#include "thread.h"
+
+#if defined (__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  /*! Windows barrier built from two manual-reset events that are used
+   *  alternately (selected by `i`) and two counters that track how many
+   *  threads still have to enter respectively leave the current round. */
+  struct BarrierSysImplementation
+  {
+    /*! creates both events (manual-reset, initially non-signaled) and sets the barrier size to N */
+    __forceinline BarrierSysImplementation (size_t N) 
+      : i(0), enterCount(0), exitCount(0), barrierSize(0) 
+    {
+      events[0] = CreateEvent(nullptr, TRUE, FALSE, nullptr);
+      events[1] = CreateEvent(nullptr, TRUE, FALSE, nullptr);
+      init(N);
+    }
+    
+    /*! closes both event handles */
+    __forceinline ~BarrierSysImplementation ()
+    {
+      CloseHandle(events[0]);
+      CloseHandle(events[1]);
+    }
+    
+    /*! sets the number of participating threads and re-arms both counters */
+    __forceinline void init(size_t N) 
+    {
+      barrierSize = N;
+      enterCount.store(N);
+      exitCount.store(N);
+    }
+
+    /*! blocks until barrierSize threads have called wait(); throws via THROW_RUNTIME_ERROR if an event call fails */
+    __forceinline void wait()
+    {
+      /* every thread entering the barrier decrements this count */
+      size_t i0 = i;               // event index of the round this thread takes part in
+      size_t cnt0 = enterCount--;  // post-decrement: cnt0 is the value before this thread entered
+
+      /* all threads except the last one wait in the barrier */
+      if (cnt0 > 1) 
+      {
+        if (WaitForSingleObject(events[i0], INFINITE) != WAIT_OBJECT_0)
+          THROW_RUNTIME_ERROR("WaitForSingleObjects failed");
+      }
+      
+      /* the last thread starts all threads waiting at the barrier */
+      else 
+      {
+        i = 1-i;                        // the next round uses the other event
+        enterCount.store(barrierSize);  // re-arm the entry counter before releasing the waiters
+        if (SetEvent(events[i0]) == 0)
+          THROW_RUNTIME_ERROR("SetEvent failed");
+      }
+
+      /* every thread leaving the barrier decrements this count */
+      size_t cnt1 = exitCount--;
+
+      /* the last thread that left the barrier resets the event again */
+      if (cnt1 == 1) 
+      {
+        exitCount.store(barrierSize);
+        if (ResetEvent(events[i0]) == 0)
+          THROW_RUNTIME_ERROR("ResetEvent failed");
+      }
+    }
+
+  public:
+    HANDLE events[2];           //!< one event per alternating round
+    atomic<size_t> i;           //!< index of the event used by the current round
+    atomic<size_t> enterCount;  //!< threads that still have to enter the current round
+    atomic<size_t> exitCount;   //!< threads that still have to leave the current round
+    size_t barrierSize;         //!< number of participating threads
+  };
+}
+
+#else
+
+namespace embree
+{
+  /*! Blocking barrier built from a MutexSys and a ConditionSys.
+   *
+   *  Threads count themselves in under the mutex; the thread that completes
+   *  the group advances `generation` and wakes everybody. Waiters loop until
+   *  they observe the generation change, so a spurious wakeup of the
+   *  condition variable cannot release a thread before the barrier is full. */
+  struct BarrierSysImplementation
+  {
+    /*! creates a barrier for N threads */
+    __forceinline BarrierSysImplementation (size_t N) 
+      : count(0), barrierSize(0), generation(0)
+    {
+      init(N);
+    }
+    
+    /*! sets the number of participating threads; asserts that no thread is currently counted in */
+    __forceinline void init(size_t N) 
+    {
+      assert(count == 0);
+      count = 0;
+      barrierSize = N;
+    }
+
+    /*! blocks the calling thread until barrierSize threads have called wait() */
+    __forceinline void wait()
+    {
+      mutex.lock();
+      count++;
+      
+      /* the last arriving thread opens the barrier for this generation */
+      if (count == barrierSize) {
+        count = 0;
+        generation = generation + 1;
+        cond.notify_all();
+        mutex.unlock();
+        return;
+      }
+      
+      /* wait until the generation we arrived in has been completed; the loop
+         filters out wakeups that were not caused by notify_all() */
+      const size_t arrivalGeneration = generation;
+      while (generation == arrivalGeneration)
+        cond.wait(mutex);
+      mutex.unlock();
+      return;
+    }
+
+  public:
+    MutexSys mutex;              //!< protects count and generation
+    ConditionSys cond;           //!< signaled when a generation completes
+    volatile size_t count;       //!< threads that arrived in the current generation
+    volatile size_t barrierSize; //!< number of participating threads
+    volatile size_t generation;  //!< incremented each time the barrier opens
+  };
+}
+
+#endif
+
+namespace embree
+{
+  /*! allocates the platform specific implementation for N threads */
+  BarrierSys::BarrierSys (size_t N) {
+    BarrierSysImplementation* const impl = new BarrierSysImplementation(N);
+    opaque = impl;
+  }
+
+  /*! destroys the platform specific implementation */
+  BarrierSys::~BarrierSys () {
+    BarrierSysImplementation* const impl = static_cast<BarrierSysImplementation*>(opaque);
+    delete impl;
+  }
+
+  /*! forwards to the implementation: sets the number of participating threads */
+  void BarrierSys::init(size_t count) {
+    BarrierSysImplementation* const impl = static_cast<BarrierSysImplementation*>(opaque);
+    impl->init(count);
+  }
+
+  /*! forwards to the implementation: blocks until all participating threads arrived */
+  void BarrierSys::wait() {
+    BarrierSysImplementation* const impl = static_cast<BarrierSysImplementation*>(opaque);
+    impl->wait();
+  }
+
+  /*! creates the barrier for N threads; N == 0 selects getNumberOfLogicalThreads() */
+  LinearBarrierActive::LinearBarrierActive (size_t N) 
+    : count0(nullptr), count1(nullptr), mode(0), flag0(0), flag1(0), threadCount(0)
+  { 
+    const size_t numThreads = (N == 0) ? size_t(getNumberOfLogicalThreads()) : N;
+    init(numThreads);
+  }
+
+  /*! frees both per-thread counter arrays */
+  LinearBarrierActive::~LinearBarrierActive() 
+  {
+    delete[] count1;
+    delete[] count0;
+  }
+
+  /*! (re)initializes the barrier for N threads and clears all flags and counters */
+  void LinearBarrierActive::init(size_t N) 
+  {
+    /* the counter arrays are only reallocated when the thread count changes */
+    if (threadCount != N)
+    {
+      threadCount = N;
+
+      delete[] count0; // no-op while count0 is still nullptr
+      count0 = new unsigned char[N];
+
+      delete[] count1; // no-op while count1 is still nullptr
+      count1 = new unsigned char[N];
+    }
+
+    mode  = 0;
+    flag0 = 0;
+    flag1 = 0;
+
+    for (size_t t=0; t<N; t++)
+      count0[t] = 0;
+    for (size_t t=0; t<N; t++)
+      count1[t] = 0;
+  }
+
+  /*! Spin-waiting barrier. The barrier alternates between two phases
+   *  (mode 0 uses count0/flag0, mode 1 uses count1/flag1). In each phase
+   *  thread 0 waits until every other thread has set its per-thread counter
+   *  and then releases them by setting the phase flag; all other threads set
+   *  their counter and spin on that flag. */
+  void LinearBarrierActive::wait (const size_t threadIndex)
+  {
+    if (mode == 0)
+    {			
+      if (threadIndex == 0)
+      {	
+        /* prepare the counters of the other phase for its next use */
+        for (size_t i=0; i<threadCount; i++)
+          count1[i] = 0;
+        
+        /* wait until threads 1..threadCount-1 have arrived */
+        for (size_t i=1; i<threadCount; i++)
+        {
+          while (likely(count0[i] == 0)) 
+            pause_cpu();
+        }
+        /* switch phase and clear the other phase's flag before releasing the waiters */
+        mode  = 1;
+        flag1 = 0;
+        __memory_barrier();
+        flag0 = 1;
+      }			
+      else
+      {					
+        /* signal arrival, then spin until thread 0 releases this phase */
+        count0[threadIndex] = 1;
+        {
+          while (likely(flag0 == 0))
+            pause_cpu();
+        }
+        
+      }		
+    }					
+    else						
+    {
+      if (threadIndex == 0)
+      {	
+        /* prepare the counters of the other phase for its next use */
+        for (size_t i=0; i<threadCount; i++)
+          count0[i] = 0;
+        
+        /* wait until threads 1..threadCount-1 have arrived */
+        for (size_t i=1; i<threadCount; i++)
+        {		
+          while (likely(count1[i] == 0))
+            pause_cpu();
+        }
+        
+        /* switch phase and clear the other phase's flag before releasing the waiters */
+        mode  = 0;
+        flag0 = 0;
+        __memory_barrier();
+        flag1 = 1;
+      }			
+      else
+      {					
+        /* signal arrival, then spin until thread 0 releases this phase */
+        count1[threadIndex] = 1;
+        {
+          while (likely(flag1 == 0))
+            pause_cpu();
+        }
+      }		
+    }					
+  }
+
+  /*! Regression test for BarrierSys: one worker per logical thread plus the
+   *  calling thread meet at the barrier 2x1000 times; between the two
+   *  barriers of a round each worker writes its id into its result slot. */
+  struct barrier_sys_regression_test : public RegressionTest
+  {
+    BarrierSys barrier;
+    std::atomic<size_t> threadID;       //!< hands out consecutive worker ids
+    std::atomic<size_t> numFailed;      //!< number of result slots with an unexpected value
+    std::vector<size_t> threadResults;  //!< one slot per worker
+
+    barrier_sys_regression_test() 
+      : RegressionTest("barrier_sys_regression_test"), threadID(0), numFailed(0)
+    {
+      registerRegressionTest(this);
+    }
+
+    /*! worker body: writes its id between the two barriers of each round */
+    static void thread_alloc(barrier_sys_regression_test* This)
+    {
+      const size_t tid = This->threadID++;
+      for (size_t round=0; round<1000; round++)
+      {
+        This->barrier.wait();
+        This->threadResults[tid] = tid;
+        This->barrier.wait();
+      }
+    }
+    
+    /*! runs the test, returns true when every slot held the expected id in every round */
+    bool run ()
+    {
+      threadID.store(0);
+      numFailed.store(0);
+
+      const size_t numThreads = getNumberOfLogicalThreads();
+      threadResults.resize(numThreads);
+      barrier.init(numThreads+1); // workers plus the calling thread
+
+      /* create threads */
+      std::vector<thread_t> threads;
+      for (size_t t=0; t<numThreads; t++)
+        threads.push_back(createThread((thread_func)thread_alloc,this));
+
+      /* run test */ 
+      for (size_t round=0; round<1000; round++)
+      {
+        for (size_t t=0; t<numThreads; t++)
+          threadResults[t] = 0;
+
+        barrier.wait();
+        barrier.wait();
+
+        for (size_t t=0; t<numThreads; t++)
+          numFailed += threadResults[t] != t;
+      }
+
+      /* destroy threads */
+      for (size_t t=0; t<numThreads; t++)
+        join(threads[t]);
+
+      return numFailed == 0;
+    }
+  };
+
+  barrier_sys_regression_test barrier_sys_regression_test;
+}
+
+
--- a/thirdparty/embree/common/sys/barrier.h
+++ b/thirdparty/embree/common/sys/barrier.h
@@ -0,0 +1,112 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "intrinsics.h"
+#include "sysinfo.h"
+#include "atomic.h"
+
+namespace embree
+{
+  /*! Barrier implemented with operating system primitives. The platform
+   *  specific state is hidden behind the `opaque` pointer. */
+  class BarrierSys
+  {
+  public:
+
+    /*! construction / destruction */
+    BarrierSys (size_t N = 0);
+    ~BarrierSys ();
+
+  private:
+    /*! class is non-copyable */
+    BarrierSys (const BarrierSys& other) DELETED; // do not implement
+    BarrierSys& operator= (const BarrierSys& other) DELETED; // do not implement
+
+  public:
+    /*! initializes the barrier with some number of threads */
+    void init(size_t count);
+
+    /*! lets calling thread wait in barrier */
+    void wait();
+
+  private:
+    void* opaque; //!< pointer to the platform specific implementation object
+  };
+
+  /*! Spinning barrier based on a single atomic counter. The counter is not
+   *  cleared by wait(); reset() has to be called before the barrier is reused. */
+  struct BarrierActive 
+  {
+  public:
+    BarrierActive () 
+      : cntr(0) {}
+    
+    /*! sets the arrival counter back to zero */
+    void reset() {
+      cntr.store(0);
+    }
+
+    /*! registers the calling thread and spins until numThreads threads have arrived */
+    void wait (size_t numThreads) 
+    {
+      cntr.fetch_add(1);
+      for (;;) {
+        if (cntr.load() == numThreads) break;
+        pause_cpu();
+      }
+    }
+
+  private:
+    std::atomic<size_t> cntr; //!< number of threads that arrived so far
+  };
+
+  /*! Spinning barrier that needs no initialization with a thread count and
+   *  leaves both counters at zero again when wait() returns. */
+  struct BarrierActiveAutoReset
+  {
+  public:
+    BarrierActiveAutoReset () 
+      : cntr0(0), cntr1(0) {}
+
+    /*! spins until threadCount threads have passed each of the four stages */
+    void wait (size_t threadCount) 
+    {
+      /* stage 1 and 2: count both counters up to threadCount */
+      cntr0.fetch_add(1);
+      while (cntr0.load() != threadCount) pause_cpu();
+      cntr1.fetch_add(1);
+      while (cntr1.load() != threadCount) pause_cpu();
+
+      /* stage 3 and 4: count both counters back down to zero */
+      cntr0.fetch_sub(1);
+      while (cntr0.load() != 0) pause_cpu();
+      cntr1.fetch_sub(1);
+      while (cntr1.load() != 0) pause_cpu();
+    }
+
+  private:
+    std::atomic<size_t> cntr0;
+    std::atomic<size_t> cntr1;
+  };
+
+  /*! Spinning barrier in which every thread identifies itself by an index
+   *  when waiting. State is kept in two per-thread counter arrays and two
+   *  flags, selected by `mode`. */
+  class LinearBarrierActive
+  {
+  public:
+
+    /*! construction and destruction */
+    LinearBarrierActive (size_t threadCount = 0);
+    ~LinearBarrierActive();
+    
+  private:
+    /*! class is non-copyable */
+    LinearBarrierActive (const LinearBarrierActive& other) DELETED; // do not implement
+    LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement
+
+  public:
+    /*! initializes the barrier with some number of threads */
+    void init(size_t threadCount);
+    
+    /*! thread with threadIndex waits in the barrier */
+    void wait (const size_t threadIndex);
+    
+  private:
+    volatile unsigned char* count0; //!< per-thread counters, first set
+    volatile unsigned char* count1; //!< per-thread counters, second set
+    volatile unsigned int mode;     //!< selects which counter set / flag is in use
+    volatile unsigned int flag0;
+    volatile unsigned int flag1;
+    volatile size_t threadCount;    //!< number of threads the arrays are sized for
+  };
+}
+
--- a/thirdparty/embree/common/sys/condition.cpp
+++ b/thirdparty/embree/common/sys/condition.cpp
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "condition.h"
+
+#if defined(__WIN32__) && !defined(PTHREADS_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  /*! Condition variable implemented with the Win32 CONDITION_VARIABLE API. */
+  struct ConditionImplementation
+  {
+    __forceinline ConditionImplementation () {
+      InitializeConditionVariable(&cond);
+    }
+
+    /*! nothing to release: no cleanup call is made for the CONDITION_VARIABLE */
+    __forceinline ~ConditionImplementation () {
+    }
+
+    /*! sleeps on the condition variable without timeout; mutex_in.mutex is
+     *  passed on as the CRITICAL_SECTION to sleep with */
+    __forceinline void wait(MutexSys& mutex_in) {
+      SleepConditionVariableCS(&cond, (LPCRITICAL_SECTION)mutex_in.mutex, INFINITE);
+    }
+
+    /*! wakes all threads sleeping on the condition variable */
+    __forceinline void notify_all() {
+      WakeAllConditionVariable(&cond);
+    }
+
+  public:
+    CONDITION_VARIABLE cond;
+  };
+}
+#endif
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+#include <pthread.h>
+namespace embree
+{
+  /*! Condition variable implemented with pthreads. Failing pthread calls are
+   *  reported through THROW_RUNTIME_ERROR, except in the destructor where
+   *  the result is only asserted. */
+  struct ConditionImplementation
+  {
+    __forceinline ConditionImplementation () { 
+      if (pthread_cond_init(&cond,nullptr) != 0)
+        THROW_RUNTIME_ERROR("pthread_cond_init failed");
+    }
+    
+    __forceinline ~ConditionImplementation() { 
+      MAYBE_UNUSED bool ok = pthread_cond_destroy(&cond) == 0;
+      assert(ok);
+    }
+    
+    /*! waits on the condition variable; mutex.mutex is passed on as the pthread_mutex_t to wait with */
+    __forceinline void wait(MutexSys& mutex) { 
+      if (pthread_cond_wait(&cond, (pthread_mutex_t*)mutex.mutex) != 0)
+        THROW_RUNTIME_ERROR("pthread_cond_wait failed");
+    }
+    
+    /*! wakes all threads waiting on the condition variable */
+    __forceinline void notify_all() { 
+      if (pthread_cond_broadcast(&cond) != 0)
+        THROW_RUNTIME_ERROR("pthread_cond_broadcast failed");
+    }
+    
+  public:
+    pthread_cond_t cond;
+  };
+}
+#endif
+
+namespace embree 
+{
+  /*! allocates the platform specific condition variable */
+  ConditionSys::ConditionSys () { 
+    ConditionImplementation* const impl = new ConditionImplementation;
+    cond = impl;
+  }
+
+  /*! destroys the platform specific condition variable */
+  ConditionSys::~ConditionSys() { 
+    ConditionImplementation* const impl = static_cast<ConditionImplementation*>(cond);
+    delete impl;
+  }
+
+  /*! forwards to the implementation: waits on the condition using mutex */
+  void ConditionSys::wait(MutexSys& mutex) { 
+    ConditionImplementation* const impl = static_cast<ConditionImplementation*>(cond);
+    impl->wait(mutex);
+  }
+
+  /*! forwards to the implementation: wakes all waiting threads */
+  void ConditionSys::notify_all() { 
+    ConditionImplementation* const impl = static_cast<ConditionImplementation*>(cond);
+    impl->notify_all();
+  }
+}
--- a/thirdparty/embree/common/sys/condition.h
+++ b/thirdparty/embree/common/sys/condition.h
@@ -0,0 +1,31 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "mutex.h"
+
+namespace embree
+{
+  /*! Condition variable used together with a MutexSys. The platform specific
+   *  state is hidden behind the `cond` pointer. */
+  class ConditionSys
+  {
+  public:
+    ConditionSys();
+    ~ConditionSys();
+
+    /*! waits on the condition using the passed mutex */
+    void wait( class MutexSys& mutex );
+
+    /*! wakes all threads waiting on the condition */
+    void notify_all();
+
+    /*! keeps calling wait(mutex) until pred() returns true; pred is
+     *  evaluated before the first wait and again after every wakeup */
+    template<typename Predicate>
+      __forceinline void wait( class MutexSys& mutex, const Predicate& pred )
+    {
+      while (!pred()) wait(mutex);
+    }
+
+  private:
+    /*! class is non-copyable */
+    ConditionSys (const ConditionSys& other) DELETED; // do not implement
+    ConditionSys& operator= (const ConditionSys& other) DELETED; // do not implement
+
+  protected:
+    void* cond; //!< pointer to the platform specific implementation object
+  };
+}
--- a/thirdparty/embree/common/sys/estring.cpp
+++ b/thirdparty/embree/common/sys/estring.cpp
@@ -0,0 +1,42 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "estring.h"
+
+#include <algorithm>
+#include <ctype.h>
+
+namespace embree
+{
+  /* The <ctype.h> functions require an argument that is representable as
+     unsigned char (or EOF). Where plain char is signed, passing a negative
+     char directly is undefined behavior, so go through unsigned char first. */
+
+  /*! converts a single character to lower case */
+  char to_lower(char c) { return char(tolower(static_cast<unsigned char>(c))); }
+
+  /*! converts a single character to upper case */
+  char to_upper(char c) { return char(toupper(static_cast<unsigned char>(c))); }
+
+  /*! returns a lower case copy of s */
+  std::string toLowerCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_lower); return dst; }
+
+  /*! returns an upper case copy of s */
+  std::string toUpperCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_upper); return dst; }
+
+  /*! parses two floats that are separated by exactly one character (e.g. "1,2") */
+  Vec2f string_to_Vec2f ( std::string str )
+  {
+    size_t consumed = 0;
+
+    const float x = std::stof(str,&consumed);
+    str = str.substr(consumed+1); // drop the parsed number and the separator character
+
+    const float y = std::stof(str,&consumed);
+
+    return Vec2f(x,y);
+  }
+  
+  /*! parses three floats that are separated by exactly one character each (e.g. "1,2,3") */
+  Vec3f string_to_Vec3f ( std::string str )
+  {
+    size_t consumed = 0;
+
+    const float x = std::stof(str,&consumed);
+    str = str.substr(consumed+1); // drop the parsed number and the separator character
+
+    const float y = std::stof(str,&consumed);
+    str = str.substr(consumed+1);
+
+    const float z = std::stof(str,&consumed);
+
+    return Vec3f(x,y,z);
+  }
+  
+  /*! parses four floats that are separated by exactly one character each (e.g. "1,2,3,4") */
+  Vec4f string_to_Vec4f ( std::string str )
+  {
+    size_t consumed = 0;
+
+    const float x = std::stof(str,&consumed);
+    str = str.substr(consumed+1); // drop the parsed number and the separator character
+
+    const float y = std::stof(str,&consumed);
+    str = str.substr(consumed+1);
+
+    const float z = std::stof(str,&consumed);
+    str = str.substr(consumed+1);
+
+    const float w = std::stof(str,&consumed);
+
+    return Vec4f(x,y,z,w);
+  }
+}
--- a/thirdparty/embree/common/sys/estring.h
+++ b/thirdparty/embree/common/sys/estring.h
@@ -0,0 +1,73 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "../math/vec2.h"
+#include "../math/vec3.h"
+#include "../math/vec4.h"
+
+namespace embree
+{
+  /*! Scope guard that records the format flags and the precision of an
+   *  output stream on construction and puts both back on destruction. */
+  class IOStreamStateRestorer 
+  {
+  public:
+    IOStreamStateRestorer(std::ostream& iostream)
+      : target(iostream)
+      , savedFlags(iostream.flags())
+      , savedPrecision(iostream.precision())
+    {}
+
+    ~IOStreamStateRestorer()
+    {
+      target.flags(savedFlags);
+      target.precision(savedPrecision);
+    }
+    
+  private:
+    std::ostream& target;             //!< stream whose state gets restored
+    std::ios::fmtflags savedFlags;    //!< flags at construction time
+    std::streamsize savedPrecision;   //!< precision at construction time
+  };
+
+  /*! Stream buffer that installs itself into an std::ostream and prefixes
+   *  every non-empty line with `indent` spaces before passing the characters
+   *  on to the stream's original buffer. The original buffer is put back
+   *  when this object is destroyed. */
+  struct IndentOStream : public std::streambuf
+  {
+    explicit IndentOStream(std::ostream &ostream, int indent = 2)
+      : wrapped(ostream.rdbuf())
+      , start_of_line(true)
+      , ident_str(indent, ' ')
+      , stream(&ostream)
+    {
+      /* from now on the stream writes through this buffer */
+      stream->rdbuf(this);
+    }
+
+    virtual ~IndentOStream()
+    {
+      /* hand the original buffer back to the stream */
+      if (stream)
+        stream->rdbuf(wrapped);
+    }
+
+  protected:
+    /*! called for every character as this buffer has no put area */
+    virtual int overflow(int ch) {
+      const bool is_newline = (ch == '\n');
+      if (start_of_line && !is_newline)
+        wrapped->sputn(ident_str.data(), ident_str.size());
+      start_of_line = is_newline;
+      return wrapped->sputc(ch);
+    }
+
+  private:
+    std::streambuf *wrapped;  //!< buffer the stream used before this one was installed
+    bool start_of_line;       //!< true if the next character starts a new line
+    std::string ident_str;    //!< the indentation that gets inserted
+    std::ostream *stream;     //!< stream this buffer is installed into
+  };
+
+  std::string toLowerCase(const std::string& s);
+  std::string toUpperCase(const std::string& s);
+
+  Vec2f string_to_Vec2f ( std::string str );
+  Vec3f string_to_Vec3f ( std::string str );
+  Vec4f string_to_Vec4f ( std::string str );
+}
--- a/thirdparty/embree/common/sys/filename.cpp
+++ b/thirdparty/embree/common/sys/filename.cpp
@@ -0,0 +1,126 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "filename.h"
+#include "sysinfo.h"
+
+namespace embree
+{
+  /* path separator that FileName normalizes both '\\' and '/' to */
+#ifdef __WIN32__
+  const char path_sep = '\\';
+#else
+  const char path_sep = '/';
+#endif
+
+  /*! creates an empty filename (the stored string is empty) */
+  FileName::FileName () {}
+
+  /*! creates a filename from a C string: both kinds of slashes are replaced
+   *  by the platform separator and trailing separators are removed */
+  FileName::FileName (const char* in) {
+    filename = in;
+
+    for (char& c : filename) {
+      const bool is_separator = (c == '\\') || (c == '/');
+      if (is_separator) c = path_sep;
+    }
+
+    while (!filename.empty() && filename.back() == path_sep)
+      filename.pop_back();
+  }
+
+  /*! creates a filename from a std::string: both kinds of slashes are
+   *  replaced by the platform separator and trailing separators are removed */
+  FileName::FileName (const std::string& in) {
+    filename = in;
+
+    for (char& c : filename) {
+      const bool is_separator = (c == '\\') || (c == '/');
+      if (is_separator) c = path_sep;
+    }
+
+    while (!filename.empty() && filename.back() == path_sep)
+      filename.pop_back();
+  }
+  
+  /*! returns the folder part of getExecutableFileName() */
+  FileName FileName::executableFolder() {
+    const FileName executable(getExecutableFileName());
+    return executable.path();
+  }
+
+  /*! returns everything in front of the last separator, or an empty
+   *  filename if there is no separator */
+  FileName FileName::path() const {
+    const size_t sep = filename.find_last_of(path_sep);
+    const bool has_folder = (sep != std::string::npos);
+    if (has_folder)
+      return filename.substr(0,sep);
+    return FileName();
+  }
+
+  /*! returns everything behind the last separator, or the whole filename
+   *  if there is no separator */
+  std::string FileName::base() const {
+    const size_t sep = filename.find_last_of(path_sep);
+    const bool has_folder = (sep != std::string::npos);
+    if (has_folder)
+      return filename.substr(sep+1);
+    return filename;
+  }
+
+  /*! returns the extension: the part of the last path component behind its
+   *  last '.', or an empty string if that component contains no '.' */
+  std::string FileName::ext() const {
+    /* only dots inside the last path component count; a dot in a folder
+       name (e.g. "dir.d/file") must not be taken for an extension */
+    size_t start = filename.find_last_of(path_sep);
+    if (start == std::string::npos) start = 0; else start++;
+    size_t pos = filename.find_last_of('.');
+    if (pos == std::string::npos || pos < start) return "";
+    return filename.substr(pos+1);
+  }
+
+  /*! drops the extension: returns the filename without the last '.' of its
+   *  last path component and everything behind it; the filename is returned
+   *  unchanged if that component contains no '.' */
+  FileName FileName::dropExt() const {
+    /* only dots inside the last path component count; a dot in a folder
+       name (e.g. "dir.d/file") must not truncate the path */
+    size_t start = filename.find_last_of(path_sep);
+    if (start == std::string::npos) start = 0; else start++;
+    size_t pos = filename.find_last_of('.');
+    if (pos == std::string::npos || pos < start) return filename;
+    return filename.substr(0,pos);
+  }
+
+  /*! returns the last path component without its extension */
+  std::string FileName::name() const {
+    /* first character behind the last separator (0 without separator) */
+    const size_t sep = filename.find_last_of(path_sep);
+    const size_t start = (sep == std::string::npos) ? 0 : sep+1;
+
+    /* a dot in front of the last component does not start an extension */
+    const size_t dot = filename.find_last_of('.');
+    const bool has_ext = (dot != std::string::npos) && !(dot < start);
+    const size_t end = has_ext ? dot : filename.size();
+
+    return filename.substr(start, end - start);
+  }
+
+  /*! replaces the extension of the last path component by ext; ext is
+   *  appended if that component has no extension */
+  FileName FileName::setExt(const std::string& ext) const {
+    /* first character behind the last separator (0 without separator) */
+    const size_t sep = filename.find_last_of(path_sep);
+    const size_t start = (sep == std::string::npos) ? 0 : sep+1;
+
+    /* a dot in front of the last component does not start an extension */
+    const size_t dot = filename.find_last_of('.');
+    const bool has_ext = (dot != std::string::npos) && !(dot < start);
+
+    if (has_ext)
+      return FileName(filename.substr(0,dot)+ext);
+    return FileName(filename+ext);
+  }
+
+  /*! appends ext to the filename without touching an existing extension */
+  FileName FileName::addExt(const std::string& ext) const {
+    const std::string extended = filename+ext;
+    return FileName(extended);
+  }
+
+  /*! joins two filenames to this/other; an empty left side yields other */
+  FileName FileName::operator +( const FileName& other ) const {
+    if (filename == "")
+      return FileName(other);
+
+    const std::string joined = filename + path_sep + other.filename;
+    return FileName(joined);
+  }
+
+  /*! joins this filename and a string to this/other */
+  FileName FileName::operator +( const std::string& other ) const {
+    const FileName rhs(other);
+    return operator+(rhs);
+  }
+
+  /*! Removes the base from a filename (if possible).
+   *
+   *  If this filename starts with the path stored in base followed by a
+   *  separator, the remainder behind that separator is returned. If both
+   *  are equal, an empty filename is returned. In every other case (empty
+   *  base, base is not a leading path of this filename) the filename is
+   *  returned unchanged. */
+  FileName FileName::operator -( const FileName& base ) const {
+    const std::string& prefix = base.filename;
+    if (prefix.empty()) return *this;
+
+    /* base has to match the leading characters as a whole; searching with
+       find_first_of would match any single character of base instead */
+    if (filename.compare(0,prefix.size(),prefix) != 0) return *this;
+    if (filename.size() == prefix.size()) return FileName();
+
+    /* the match has to end at a component boundary ("/a/bc" - "/a/b" is not "c") */
+    if (filename[prefix.size()] != path_sep) return *this;
+    return FileName(filename.substr(prefix.size()+1));
+  }
+
+  /*! two filenames are equal if their stored strings are equal */
+  bool operator== (const FileName& a, const FileName& b) {
+    const std::string& lhs = a.filename;
+    const std::string& rhs = b.filename;
+    return lhs == rhs;
+  }
+  
+  /*! two filenames differ if their stored strings differ */
+  bool operator!= (const FileName& a, const FileName& b) {
+    const std::string& lhs = a.filename;
+    const std::string& rhs = b.filename;
+    return lhs != rhs;
+  }
+
+  /*! writes the stored string to the stream */
+  std::ostream& operator<<(std::ostream& cout, const FileName& filename) {
+    cout << filename.filename;
+    return cout;
+  }
+}
--- a/thirdparty/embree/common/sys/filename.h
+++ b/thirdparty/embree/common/sys/filename.h
@@ -0,0 +1,78 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+namespace embree
+{
+  /*! Convenience class for handling file names and paths. The name is kept
+   *  as a single std::string. */
+  class FileName
+  {
+  public:
+
+    /*! create an empty filename */
+    FileName ();
+
+    /*! create a valid filename from a string */
+    FileName (const char* filename);
+
+    /*! create a valid filename from a string */
+    FileName (const std::string& filename);
+
+    /*! returns path to executable */
+    static FileName executableFolder();
+
+    /*! auto convert into a string */
+    operator std::string() const { return filename; }
+
+    /*! returns a string of the filename */
+    const std::string str() const { return filename; }
+
+    /*! returns a c-string of the filename; the pointer refers to the string
+     *  stored inside this object */
+    const char* c_str() const { return filename.c_str(); }
+
+    /*! returns the path of a filename */
+    FileName path() const;
+
+    /*! returns the file of a filename (last path component including extension) */
+    std::string base() const;
+
+    /*! returns the base of a filename without extension */
+    std::string name() const;
+
+    /*! returns the file extension */
+    std::string ext() const;
+
+    /*! drops the file extension */
+    FileName dropExt() const;
+
+    /*! replaces the file extension */
+    FileName setExt(const std::string& ext = "") const;
+
+    /*! adds file extension */
+    FileName addExt(const std::string& ext = "") const;
+
+    /*! concatenates two filenames to this/other */
+    FileName operator +( const FileName& other ) const;
+
+    /*! concatenates two filenames to this/other */
+    FileName operator +( const std::string& other ) const;
+
+    /*! removes the base from a filename (if possible) */
+    FileName operator -( const FileName& base ) const;
+
+    /*! == operator */
+    friend bool operator==(const FileName& a, const FileName& b);
+
+    /*! != operator */
+    friend bool operator!=(const FileName& a, const FileName& b);
+
+    /*! output operator */
+    friend std::ostream& operator<<(std::ostream& cout, const FileName& filename);
+   
+  private:
+    std::string filename; //!< the stored file name
+  };
+}
--- a/thirdparty/embree/common/sys/intrinsics.h
+++ b/thirdparty/embree/common/sys/intrinsics.h
@@ -0,0 +1,566 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+#if defined(__WIN32__)
+#include <intrin.h>
+#endif
+
+#if defined(__ARM_NEON)
+#include "../simd/arm/emulation.h"
+#else
+#include <immintrin.h>
+#if defined(__EMSCRIPTEN__)
+#include "../simd/wasm/emulation.h"
+#endif
+#endif
+
+#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
+  #if !defined(_tzcnt_u32)
+    #define _tzcnt_u32 __tzcnt_u32
+  #endif
+  #if !defined(_tzcnt_u64)
+    #define _tzcnt_u64 __tzcnt_u64
+  #endif
+#endif
+
+#if defined(__aarch64__)
+  #if !defined(_lzcnt_u32)
+    #define _lzcnt_u32 __builtin_clz
+  #endif
+#else
+  #if defined(__LZCNT__)
+    #if !defined(_lzcnt_u32)
+      #define _lzcnt_u32 __lzcnt32
+    #endif
+    #if !defined(_lzcnt_u64)
+      #define _lzcnt_u64 __lzcnt64
+    #endif
+  #endif
+#endif
+
+#if defined(__WIN32__)
+#  if !defined(NOMINMAX)
+#    define NOMINMAX
+#  endif
+#  include <windows.h>
+#endif
+
+/* normally defined in pmmintrin.h, but we always need this */
+#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
+#define _MM_DENORMALS_ZERO_ON   (0x0040)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+#endif
+
+namespace embree
+{
+  
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+  
+#if defined(__WIN32__) && !defined(__INTEL_LLVM_COMPILER)
+  
+  /*! returns the current value of QueryPerformanceCounter (used in place of
+   *  a CPU time stamp counter on this code path) */
+  __forceinline size_t read_tsc()  
+  {
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li);
+    return (size_t)li.QuadPart;
+  }
+  
+  /*! bit scan forward: index of the lowest set bit of v. Uses tzcnt when
+   *  built with AVX2 and _BitScanForward otherwise.
+   *  NOTE(review): callers presumably pass v != 0 -- confirm. */
+  __forceinline int bsf(int v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return _tzcnt_u32(v);
+#else
+    unsigned long r = 0; _BitScanForward(&r,v); return r;
+#endif
+  }
+  
+  /*! bit scan forward for unsigned 32-bit values */
+  __forceinline unsigned bsf(unsigned v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return _tzcnt_u32(v);
+#else
+    unsigned long r = 0; _BitScanForward(&r,v); return r;
+#endif
+  }
+  
+  /*! bit scan forward for size_t, only available on 64-bit targets */
+#if defined(__X86_64__) || defined (__aarch64__)
+  __forceinline size_t bsf(size_t v) {
+#if defined(__AVX2__) 
+    return _tzcnt_u64(v);
+#else
+    unsigned long r = 0; _BitScanForward64(&r,v); return r;
+#endif
+  }
+#endif
+  
+  /*! bit scan and clear forward: returns bsf(v) and clears the lowest set
+   *  bit of v in place (v &= v-1) */
+  __forceinline int bscf(int& v) 
+  {
+    int i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+  
+  /*! bit scan and clear forward for unsigned 32-bit values */
+  __forceinline unsigned bscf(unsigned& v) 
+  {
+    unsigned i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+  
+  /*! bit scan and clear forward for size_t, only available on 64-bit targets */
+#if defined(__X86_64__) || defined (__aarch64__)
+  __forceinline size_t bscf(size_t& v) 
+  {
+    size_t i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+#endif
+  
+  /*! bit scan reverse: index of the highest set bit of v. Uses lzcnt when
+   *  built with AVX2 and _BitScanReverse otherwise.
+   *  NOTE(review): callers presumably pass v != 0 -- confirm. */
+  __forceinline int bsr(int v) {
+#if defined(__AVX2__)  && !defined(__aarch64__)
+    return 31 - _lzcnt_u32(v);
+#else
+    unsigned long r = 0; _BitScanReverse(&r,v); return r;
+#endif
+  }
+  
+  /*! bit scan reverse for unsigned 32-bit values */
+  __forceinline unsigned bsr(unsigned v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return 31 - _lzcnt_u32(v);
+#else
+    unsigned long r = 0; _BitScanReverse(&r,v); return r;
+#endif
+  }
+  
+  /*! bit scan reverse for size_t, only available on 64-bit targets */
+#if defined(__X86_64__) || defined (__aarch64__)
+  __forceinline size_t bsr(size_t v) {
+#if defined(__AVX2__) 
+    return 63 -_lzcnt_u64(v);
+#else
+    unsigned long r = 0; _BitScanReverse64(&r, v); return r;
+#endif
+  }
+#endif
+  
+  /*! number of leading zero bits of the 32-bit value x; the fallback path
+   *  returns 32 for x == 0 explicitly and derives the count from bsr otherwise */
+  __forceinline int lzcnt(const int x)
+  {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return _lzcnt_u32(x);
+#else
+    if (unlikely(x == 0)) return 32;
+    return 31 - bsr(x);    
+#endif
+  }
+  
+  /*! bit test and complement: returns v with bit i flipped */
+  __forceinline int btc(int v, int i) {
+    long r = v; _bittestandcomplement(&r,i); return r;
+  }
+  
+  /*! bit test and set: returns v with bit i set */
+  __forceinline int bts(int v, int i) {
+    long r = v; _bittestandset(&r,i); return r;
+  }
+  
+  /*! bit test and reset: returns v with bit i cleared */
+  __forceinline int btr(int v, int i) {
+    long r = v; _bittestandreset(&r,i); return r;
+  }
+  
+  /* 64-bit variants of btc/bts/btr, only available on x86-64 */
+#if defined(__X86_64__)
+  
+  /*! bit test and complement: returns v with bit i flipped */
+  __forceinline size_t btc(size_t v, size_t i) {
+    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
+  }
+  
+  /*! bit test and set: returns v with bit i set */
+  __forceinline size_t bts(size_t v, size_t i) {
+    __int64 r = v; _bittestandset64(&r,i); return r;
+  }
+  
+  /*! bit test and reset: returns v with bit i cleared */
+  __forceinline size_t btr(size_t v, size_t i) {
+    __int64 r = v; _bittestandreset64(&r,i); return r;
+  }
+  
+#endif
+  
+  /*! atomic compare and exchange: stores v into *p if *p equals c; returns
+   *  the result of _InterlockedCompareExchange. Note the argument order of
+   *  the intrinsic: the new value v comes before the comparand c. */
+  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
+    return _InterlockedCompareExchange((volatile long*)p,v,c);
+  }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+  
+#else
+  
+  /*! reads the time stamp counter with the rdtsc instruction when x86
+   *  inline assembly is available and returns 0 otherwise */
+  __forceinline uint64_t read_tsc()  {
+#if defined(__X86_ASM__)
+    uint32_t high,low;
+    asm volatile ("rdtsc" : "=d"(high), "=a"(low)); // rdtsc delivers the counter in edx:eax
+    return (((uint64_t)high) << 32) + (uint64_t)low;
+#else
+    /* Not supported yet, meaning measuring traversal cost per pixel does not work. */
+    return 0;
+#endif
+  }
+  
+  /*! bit scan forward: index of the lowest set bit of v. Picks
+   *  __builtin_ctz on NEON builds, tzcnt with AVX2, the bsf instruction
+   *  when x86 inline assembly is available, and __builtin_ctz otherwise.
+   *  NOTE(review): callers presumably pass v != 0 -- confirm. */
+  __forceinline int bsf(int v) {
+#if defined(__ARM_NEON)
+    return __builtin_ctz(v);
+#else
+#if defined(__AVX2__)
+    return _tzcnt_u32(v);
+#elif defined(__X86_ASM__)
+    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+    return __builtin_ctz(v);
+#endif
+#endif
+  }
+
+  /* bit scan forward for unsigned 32-bit values: sycl::ctz in SYCL device
+     code, otherwise (64-bit builds only) the same selection as bsf(int) */
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+  __forceinline unsigned int   bsf(unsigned v) {
+    return sycl::ctz(v);
+  }
+
+#else
+  
+#if defined(__64BIT__)
+  __forceinline unsigned bsf(unsigned v) 
+  {
+#if defined(__ARM_NEON)
+    return __builtin_ctz(v);
+#else
+#if defined(__AVX2__)
+    return _tzcnt_u32(v);
+#elif defined(__X86_ASM__)
+    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+    return __builtin_ctz(v);
+#endif
+#endif
+  }
+#endif
+#endif
+  
+  /* bit scan forward for size_t: sycl::ctz in SYCL device code; otherwise
+     tzcnt with AVX2 (64-bit or 32-bit form depending on __X86_64__), the
+     bsf instruction with x86 inline assembly, or __builtin_ctzl */
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+  __forceinline size_t bsf(size_t v) {
+    return sycl::ctz(v);
+  }
+#else
+  
+  __forceinline size_t bsf(size_t v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+#if defined(__X86_64__)
+    return _tzcnt_u64(v);
+#else
+    return _tzcnt_u32(v);
+#endif
+#elif defined(__X86_ASM__)
+    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+    return __builtin_ctzl(v);
+#endif
+  }
+#endif
+
+  /*! bit scan and clear forward: returns bsf(v) and clears the lowest set
+   *  bit of v in place (v &= v-1) */
+  __forceinline int bscf(int& v) 
+  {
+    int i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+  
+  /*! bit scan and clear forward for unsigned 32-bit values (64-bit builds only) */
+#if defined(__64BIT__)
+  __forceinline unsigned int bscf(unsigned int& v) 
+  {
+    unsigned int i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+#endif
+  
+  /*! bit scan and clear forward for size_t */
+  __forceinline size_t bscf(size_t& v) 
+  {
+    size_t i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+  
+  /*! bit scan reverse: index of the highest set bit of v. Picks lzcnt with
+   *  AVX2, the bsr instruction when x86 inline assembly is available, and
+   *  __builtin_clz otherwise.
+   *  NOTE(review): callers presumably pass v != 0 -- confirm. */
+  __forceinline int bsr(int v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return 31 - _lzcnt_u32(v);
+#elif defined(__X86_ASM__)
+    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+    return __builtin_clz(v) ^ 31;
+#endif
+  }
+  
+  /*! bit scan reverse for unsigned 32-bit values (64-bit and Emscripten builds only) */
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
+  __forceinline unsigned bsr(unsigned v) {
+#if defined(__AVX2__) 
+    return 31 - _lzcnt_u32(v);
+#elif defined(__X86_ASM__)
+    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+    return __builtin_clz(v) ^ 31;
+#endif
+  }
+#endif
+  
+  /*! bit scan reverse for size_t; the builtin fallback derives the index
+   *  from the bit width of size_t */
+  __forceinline size_t bsr(size_t v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+#if defined(__X86_64__)
+    return 63 - _lzcnt_u64(v);
+#else
+    return 31 - _lzcnt_u32(v);
+#endif
+#elif defined(__X86_ASM__)
+    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
+#endif
+  }
+  
+  /*! number of leading zero bits of the 32-bit value x; the fallback path
+   *  returns 32 for x == 0 explicitly and derives the count from bsr otherwise */
+  __forceinline int lzcnt(const int x)
+  {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return _lzcnt_u32(x);
+#else
+    if (unlikely(x == 0)) return 32;
+    return 31 - bsr(x);    
+#endif
+  }
+
+  /*! returns v with its lowest set bit cleared; uses the blsr intrinsics
+   *  with AVX2 and the expression v & (v-1) otherwise */
+  __forceinline size_t blsr(size_t v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+  #if defined(__INTEL_COMPILER)
+    return _blsr_u64(v);
+  #else
+    #if defined(__X86_64__)
+       return __blsr_u64(v);
+    #else
+       return __blsr_u32(v);
+    #endif
+  #endif
+#else
+       return v & (v-1);
+#endif
+  }
+  
+  /*! Bit test and complement: returns v with bit i toggled. */
+  __forceinline int btc(int v, int i)
+  {
+#if defined(__X86_ASM__)
+    int result = 0;
+    asm ("btc %1,%0" : "=r"(result) : "r"(i), "0"(v) : "flags" );
+    return result;
+#else
+    const int mask = 1 << i;
+    return v ^ mask;
+#endif
+  }
+  
+  /*! Bit test and set: returns v with bit i set. */
+  __forceinline int bts(int v, int i)
+  {
+#if defined(__X86_ASM__)
+    int result = 0;
+    asm ("bts %1,%0" : "=r"(result) : "r"(i), "0"(v) : "flags");
+    return result;
+#else
+    const int mask = 1 << i;
+    return v | mask;
+#endif
+  }
+  
+  /*! Bit test and reset: returns v with bit i cleared. */
+  __forceinline int btr(int v, int i)
+  {
+#if defined(__X86_ASM__)
+    int result = 0;
+    asm ("btr %1,%0" : "=r"(result) : "r"(i), "0"(v) : "flags");
+    return result;
+#else
+    const int mask = 1 << i;
+    return v & ~mask;
+#endif
+  }
+  
+  /*! Bit test and complement: returns v with bit i toggled.
+   *  The portable path builds the mask in size_t so that bit indices at or above
+   *  the width of int are handled (a plain `1 << i` is evaluated in int). */
+  __forceinline size_t btc(size_t v, size_t i) {
+#if defined(__X86_ASM__)
+    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
+#else
+    return (v ^ (size_t(1) << i));
+#endif
+  }
+  
+  /*! Bit test and set: returns v with bit i set.
+   *  The portable path builds the mask in size_t so that bit indices at or above
+   *  the width of int are handled (a plain `1 << i` is evaluated in int). */
+  __forceinline size_t bts(size_t v, size_t i) {
+#if defined(__X86_ASM__)
+    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#else
+    return (v | (size_t(1) << i));
+#endif
+  }
+  
+  /*! Bit test and reset: returns v with bit i cleared.
+   *  The portable path builds the mask in size_t: with an int mask, `~(1 << i)`
+   *  is widened after the complement, so i == 31 would also clear the upper
+   *  half of a 64-bit v, and larger indices are undefined. */
+  __forceinline size_t btr(size_t v, size_t i) {
+#if defined(__X86_ASM__)
+    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#else
+    return (v & ~(size_t(1) << i));
+#endif
+  }
+
+  /*! Atomic compare-and-swap on a 32-bit integer; forwards to
+   *  __sync_val_compare_and_swap and returns whatever that builtin yields. */
+  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input)
+  {
+    const int32_t observed = __sync_val_compare_and_swap(value, comparand, input);
+    return observed;
+  }
+  
+#endif
+  
+#if !defined(__WIN32__)
+
+#if defined(__i386__) && defined(__PIC__)
+
+  /*! Executes CPUID with leaf `op` and stores eax, ebx, ecx, edx in out[0..3].
+   *  Variant for i386 PIC builds: ebx is exchanged with operand %1 before and
+   *  after the cpuid instruction, so ebx holds its original value afterwards
+   *  and out[1] receives cpuid's ebx result.
+   *  NOTE(review): presumably needed because ebx is reserved in i386 PIC code - confirm. */
+  __forceinline void __cpuid(int out[4], int op)
+  {
+    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
+                  "cpuid\n\t"
+                  "xchg{l}\t{%%}ebx, %1\n\t"
+                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+                  : "0"(op));
+  }
+
+  /*! Executes CPUID with leaf `op1` (eax) and sub-leaf `op2` (ecx) and stores
+   *  eax, ebx, ecx, edx in out[0..3]. Variant for i386 PIC builds: ebx is
+   *  exchanged with operand %1 around the cpuid instruction, so ebx holds its
+   *  original value afterwards and out[1] receives cpuid's ebx result. */
+  __forceinline void __cpuid_count(int out[4], int op1, int op2)
+  {
+    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
+                  "cpuid\n\t"
+                  "xchg{l}\t{%%}ebx, %1\n\t"
+                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
+                  : "0" (op1), "2" (op2));
+  }
+
+#elif defined(__X86_ASM__)
+
+  /*! Executes CPUID with leaf `op` (passed in eax) and stores eax, ebx, ecx, edx in out[0..3]. */
+  __forceinline void __cpuid(int out[4], int op) {
+    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
+  }
+
+  /*! Executes CPUID with leaf `op1` (eax) and sub-leaf `op2` (ecx) and stores
+   *  eax, ebx, ecx, edx in out[0..3]. */
+  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
+    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
+  }
+
+#endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+  
+#if defined(__clang__) || defined(__GNUC__)
+  /* Stand-ins for the _mm*_undefined_* intrinsics, compiled only when no macro
+     of the same name exists. Each one simply returns an all-zero vector. */
+#if !defined(_mm_undefined_ps)
+  __forceinline __m128 _mm_undefined_ps()
+  {
+    return _mm_setzero_ps();
+  }
+#endif
+#if !defined(_mm_undefined_si128)
+  __forceinline __m128i _mm_undefined_si128()
+  {
+    return _mm_setzero_si128();
+  }
+#endif
+#if !defined(_mm256_undefined_ps) && defined(__AVX__)
+  __forceinline __m256 _mm256_undefined_ps()
+  {
+    return _mm256_setzero_ps();
+  }
+#endif
+#if !defined(_mm256_undefined_si256) && defined(__AVX__)
+  __forceinline __m256i _mm256_undefined_si256()
+  {
+    return _mm256_setzero_si256();
+  }
+#endif
+#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
+  __forceinline __m512 _mm512_undefined_ps()
+  {
+    return _mm512_setzero_ps();
+  }
+#endif
+#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
+  __forceinline __m512i _mm512_undefined_epi32()
+  {
+    return _mm512_setzero_si512();
+  }
+#endif
+#endif
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+
+  /*! Population count on SYCL devices; forwards to sycl::popcount. */
+  __forceinline unsigned int popcnt(unsigned int in)
+  {
+    const unsigned int count = sycl::popcount(in);
+    return count;
+  }
+  
+#else
+  
+#if defined(__SSE4_2__) || defined(__ARM_NEON)
+  
+  /*! Population count of a 32-bit int via _mm_popcnt_u32. */
+  __forceinline int popcnt(int in)
+  {
+    const int count = _mm_popcnt_u32(in);
+    return count;
+  }
+
+  /*! Population count of a 32-bit unsigned value via _mm_popcnt_u32. */
+  __forceinline unsigned popcnt(unsigned in)
+  {
+    const unsigned count = _mm_popcnt_u32(in);
+    return count;
+  }
+
+#if defined(__64BIT__)
+  /*! Population count of a size_t value via _mm_popcnt_u64 (64-bit builds only). */
+  __forceinline size_t popcnt(size_t in)
+  {
+    const size_t count = _mm_popcnt_u64(in);
+    return count;
+  }
+#endif
+  
+#endif
+  
+#endif
+
+#if defined(__X86_ASM__)
+  /*! Returns the value of read_tsc(), bracketed by two CPUID (leaf 0) calls
+   *  whose results are discarded into `dummy`.
+   *  NOTE(review): the CPUID calls presumably act as serializing fences around
+   *  the counter read - confirm against read_tsc(). */
+  __forceinline uint64_t rdtsc()
+  {
+    int dummy[4]; 
+    __cpuid(dummy,0); 
+    uint64_t clock = read_tsc(); 
+    __cpuid(dummy,0); 
+    return clock;
+  }
+#endif
+  
+  /*! Issues N consecutive _mm_pause() calls (8 when no argument is given). */
+  __forceinline void pause_cpu(const size_t N = 8)
+  {
+    size_t remaining = N;
+    while (remaining != 0) {
+      _mm_pause();
+      --remaining;
+    }
+  }
+  
+  /* prefetches */
+  /*! prefetch with the _MM_HINT_T0 hint */
+  __forceinline void prefetchL1 (const void* ptr)
+  {
+    _mm_prefetch(static_cast<const char*>(ptr),_MM_HINT_T0);
+  }
+  /*! prefetch with the _MM_HINT_T1 hint */
+  __forceinline void prefetchL2 (const void* ptr)
+  {
+    _mm_prefetch(static_cast<const char*>(ptr),_MM_HINT_T1);
+  }
+  /*! prefetch with the _MM_HINT_T2 hint */
+  __forceinline void prefetchL3 (const void* ptr)
+  {
+    _mm_prefetch(static_cast<const char*>(ptr),_MM_HINT_T2);
+  }
+  /*! prefetch with the _MM_HINT_NTA hint */
+  __forceinline void prefetchNTA(const void* ptr)
+  {
+    _mm_prefetch(static_cast<const char*>(ptr),_MM_HINT_NTA);
+  }
+  /*! prefetch using _MM_HINT_ET0 with the Intel compiler and _MM_HINT_T0 otherwise */
+  __forceinline void prefetchEX (const void* ptr)
+  {
+    const char* const address = static_cast<const char*>(ptr);
+#if defined(__INTEL_COMPILER)
+    _mm_prefetch(address,_MM_HINT_ET0);
+#else
+    _mm_prefetch(address,_MM_HINT_T0);
+#endif
+  }
+
+  /*! alias of prefetchEX */
+  __forceinline void prefetchL1EX(const void* ptr)
+  {
+    prefetchEX(ptr);
+  }
+
+  /*! alias of prefetchEX (no separate L2 variant is implemented) */
+  __forceinline void prefetchL2EX(const void* ptr)
+  {
+    prefetchEX(ptr);
+  }
+#if defined(__AVX2__) && !defined(__aarch64__)
+   /*! 32-bit wrappers around the _pext_u32 / _pdep_u32 intrinsics */
+   __forceinline unsigned int pext(unsigned int a, unsigned int b)
+   {
+     return _pext_u32(a, b);
+   }
+   __forceinline unsigned int pdep(unsigned int a, unsigned int b)
+   {
+     return _pdep_u32(a, b);
+   }
+#if defined(__X86_64__)
+   /*! size_t wrappers around the _pext_u64 / _pdep_u64 intrinsics */
+   __forceinline size_t pext(size_t a, size_t b)
+   {
+     return _pext_u64(a, b);
+   }
+   __forceinline size_t pdep(size_t a, size_t b)
+   {
+     return _pdep_u64(a, b);
+   }
+#endif
+#endif
+
+#if defined(__AVX512F__)
+#if defined(__INTEL_COMPILER)
+   /* the Intel compiler provides these intrinsics, so just forward to them */
+   __forceinline float mm512_cvtss_f32(__m512 v)
+   {
+     return _mm512_cvtss_f32(v);
+   }
+   __forceinline int mm512_mask2int(__mmask16 k1)
+   {
+     return _mm512_mask2int(k1);
+   }
+   __forceinline __mmask16 mm512_int2mask(int mask)
+   {
+     return _mm512_int2mask(mask);
+   }
+#else
+   /* FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3 */
+   __forceinline float mm512_cvtss_f32(__m512 v)
+   {
+     const __m128 low = _mm512_castps512_ps128(v);
+     return _mm_cvtss_f32(low);
+   }
+   /* FIXME: _mm512_mask2int not yet supported by GCC 6.3 */
+   __forceinline int mm512_mask2int(__mmask16 k1)
+   {
+     return static_cast<int>(k1);
+   }
+   /* FIXME: _mm512_int2mask not yet supported by GCC 6.3 */
+   __forceinline __mmask16 mm512_int2mask(int mask)
+   {
+     return static_cast<__mmask16>(mask);
+   }
+#endif
+#endif
+}
--- a/thirdparty/embree/common/sys/library.cpp
+++ b/thirdparty/embree/common/sys/library.cpp
@@ -0,0 +1,83 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "library.h"
+#include "sysinfo.h"
+#include "filename.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  /* Opens the shared library `<file>.dll` from the path returned by
+     getExecutableFileName().path(). The handle returned by the loader is
+     passed back unchecked, so a failed load yields a null lib_t. */
+  lib_t openLibrary(const std::string& file)
+  {
+    std::string fullName = file+".dll";
+    FileName executable = getExecutableFileName();
+    /* The path is a narrow string, so call the ANSI entry point explicitly: the
+       LoadLibrary macro resolves to LoadLibraryW when UNICODE is defined. */
+    HMODULE handle = LoadLibraryA((executable.path() + fullName).c_str());
+    return lib_t(handle);
+  }
+
+  /* looks up `sym` in the given library via GetProcAddress and returns its address */
+  void* getSymbol(lib_t lib, const std::string& sym)
+  {
+    const HMODULE handle = HMODULE(lib);
+    return (void*)GetProcAddress(handle,sym.c_str());
+  }
+
+  /* releases the library handle via FreeLibrary */
+  void closeLibrary(lib_t lib)
+  {
+    const HMODULE handle = HMODULE(lib);
+    FreeLibrary(handle);
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <dlfcn.h>
+
+namespace embree
+{
+  /* Opens the shared library `lib<file>.so` (`lib<file>.dylib` on macOS).
+     The bare file name is tried first; if that fails, the name is tried again
+     relative to the path of the running executable. When both attempts fail,
+     THROW_RUNTIME_ERROR is raised with the dlerror() text, or with a message
+     naming the library when dlerror() has nothing to report. */
+  lib_t openLibrary(const std::string& file)
+  {
+#if defined(__MACOSX__)
+    std::string fullName = "lib"+file+".dylib";
+#else
+    std::string fullName = "lib"+file+".so";
+#endif
+    /* first attempt: bare library name */
+    void* lib = dlopen(fullName.c_str(), RTLD_NOW);
+    if (lib) return lib_t(lib);
+
+    /* second attempt: next to the executable */
+    FileName executable = getExecutableFileName();
+    lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW);
+    if (lib == nullptr) {
+      const char* error = dlerror();
+      if (error) { 
+        THROW_RUNTIME_ERROR(error);
+      } else {
+        /* report the library that failed to load, not the executable */
+        THROW_RUNTIME_ERROR("could not load library "+fullName);
+      }
+    }
+    return lib_t(lib);
+  }
+
+  /* looks up `sym` in the given library via dlsym and returns its address */
+  void* getSymbol(lib_t lib, const std::string& sym)
+  {
+    void* const address = dlsym(lib,sym.c_str());
+    return address;
+  }
+
+  /* releases the library handle via dlclose; its return value is ignored */
+  void closeLibrary(lib_t lib)
+  {
+    dlclose(lib);
+  }
+}
+#endif
--- a/thirdparty/embree/common/sys/library.h
+++ b/thirdparty/embree/common/sys/library.h
@@ -0,0 +1,21 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+namespace embree
+{
+  /*! Handle type for a loaded shared library: a pointer to the struct
+   *  opaque_lib_t, which is only declared here. */
+  typedef struct opaque_lib_t* lib_t;
+
+  /*! Loads a shared library. `file` is the base name; the platform-specific
+   *  prefix and extension (".dll", "lib" + ".so" / ".dylib") are added by the
+   *  implementation in library.cpp. */
+  lib_t openLibrary(const std::string& file);
+
+  /*! Returns the address of symbol `sym` in library `lib`. */
+  void* getSymbol(lib_t lib, const std::string& sym);
+
+  /*! Unloads a shared library previously returned by openLibrary. */
+  void closeLibrary(lib_t lib);
+}
--- a/thirdparty/embree/common/sys/mutex.cpp
+++ b/thirdparty/embree/common/sys/mutex.cpp
@@ -0,0 +1,58 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "mutex.h"
+#include "regression.h"
+
+#if defined(__WIN32__) && !defined(PTHREADS_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  /* MutexSys implemented on top of a heap-allocated Win32 CRITICAL_SECTION */
+
+  MutexSys::MutexSys()
+  {
+    CRITICAL_SECTION* section = new CRITICAL_SECTION;
+    mutex = section;
+    InitializeCriticalSection(section);
+  }
+
+  MutexSys::~MutexSys()
+  {
+    CRITICAL_SECTION* section = (CRITICAL_SECTION*)mutex;
+    DeleteCriticalSection(section);
+    delete section;
+  }
+
+  void MutexSys::lock()
+  {
+    EnterCriticalSection((CRITICAL_SECTION*)mutex);
+  }
+
+  bool MutexSys::try_lock()
+  {
+    const BOOL entered = TryEnterCriticalSection((CRITICAL_SECTION*)mutex);
+    return entered != 0;
+  }
+
+  void MutexSys::unlock()
+  {
+    LeaveCriticalSection((CRITICAL_SECTION*)mutex);
+  }
+}
+#endif
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+#include <pthread.h>
+namespace embree
+{
+  /*! system mutex using pthreads */
+  MutexSys::MutexSys()
+  {
+    /* the pthread mutex lives on the heap and is stored behind the void* member */
+    pthread_mutex_t* handle = new pthread_mutex_t;
+    mutex = handle;
+    const int status = pthread_mutex_init(handle, nullptr);
+    if (status != 0)
+      THROW_RUNTIME_ERROR("pthread_mutex_init failed");
+  }
+  
+  MutexSys::~MutexSys()
+  {
+    pthread_mutex_t* handle = (pthread_mutex_t*)mutex;
+    /* a failed destroy is only checked by the assert */
+    MAYBE_UNUSED bool ok = pthread_mutex_destroy(handle) == 0;
+    assert(ok);
+    delete handle;
+    mutex = nullptr;
+  }
+  
+  void MutexSys::lock()
+  {
+    /* any non-zero pthread status is treated as a fatal error */
+    const int status = pthread_mutex_lock((pthread_mutex_t*)mutex);
+    if (status != 0)
+      THROW_RUNTIME_ERROR("pthread_mutex_lock failed");
+  }
+  
+  bool MutexSys::try_lock()
+  {
+    /* true only when pthread_mutex_trylock reports success (0) */
+    const int status = pthread_mutex_trylock((pthread_mutex_t*)mutex);
+    return status == 0;
+  }
+  
+  void MutexSys::unlock()
+  {
+    /* any non-zero pthread status is treated as a fatal error */
+    const int status = pthread_mutex_unlock((pthread_mutex_t*)mutex);
+    if (status != 0)
+      THROW_RUNTIME_ERROR("pthread_mutex_unlock failed");
+  }
+};
+#endif
--- a/thirdparty/embree/common/sys/mutex.h
+++ b/thirdparty/embree/common/sys/mutex.h
@@ -0,0 +1,104 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "intrinsics.h"
+#include "atomic.h"
+
+#define CPU_CACHELINE_SIZE 64
+namespace embree
+{
+  /*! System mutex. The platform object is kept behind the opaque `mutex`
+   *  pointer; the member functions are defined out of line in mutex.cpp. */
+  class MutexSys {
+    friend struct ConditionImplementation;
+  public:
+    MutexSys();
+    ~MutexSys();
+
+  private:
+    /* non-copyable */
+    MutexSys (const MutexSys& other) DELETED; // do not implement
+    MutexSys& operator= (const MutexSys& other) DELETED; // do not implement
+
+  public:
+    /*! acquires the mutex */
+    void lock();
+    /*! attempts to acquire the mutex; returns true on success */
+    bool try_lock();
+    /*! releases the mutex */
+    void unlock();
+
+  protected:
+    /* opaque pointer to the platform mutex object */
+    void* mutex;
+  };
+
+  /*! Spinning mutex built on a single atomic flag (true while the lock is held). */
+  class SpinLock
+  {
+  public:
+
+    SpinLock ()
+      : flag(false) {}
+
+    /*! returns the current value of the flag */
+    __forceinline bool isLocked() {
+      return flag.load();
+    }
+
+    /*! spins until the flag could be switched from false to true */
+    __forceinline void lock()
+    {
+      for (;;)
+      {
+        /* spin on plain loads first; the exchange is only tried once the flag reads false */
+        wait_until_unlocked();
+
+        bool expected = false;
+        if (flag.compare_exchange_strong(expected,true,std::memory_order_acquire))
+          return;
+      }
+    }
+
+    /*! single acquisition attempt; returns true when the lock was taken */
+    __forceinline bool try_lock()
+    {
+      /* cheap early-out when the flag is already set */
+      if (flag.load())
+        return false;
+
+      bool expected = false;
+      return flag.compare_exchange_strong(expected,true,std::memory_order_acquire);
+    }
+
+    /*! clears the flag with release ordering */
+    __forceinline void unlock() {
+      flag.store(false,std::memory_order_release);
+    }
+
+    /*! spins (two pause hints per iteration) while the flag is set; does not take the lock */
+    __forceinline void wait_until_unlocked()
+    {
+      while (flag.load())
+      {
+        _mm_pause();
+        _mm_pause();
+      }
+    }
+
+  public:
+    atomic<bool> flag;
+  };
+
+  /*! SpinLock followed by CPU_CACHELINE_SIZE - sizeof(SpinLock) bytes of padding.
+   *  NOTE(review): presumably so that each lock occupies its own cache line - confirm. */
+  class PaddedSpinLock : public SpinLock
+  {
+  private:
+    /* never read or written; only enlarges the object */
+    MAYBE_UNUSED char padding[CPU_CACHELINE_SIZE - sizeof(SpinLock)];
+  };
+  /*! Scoped lock helper: the destructor unlocks the mutex when this object is
+   *  marked as holding it. */
+  template<typename Mutex> class Lock {
+  public:
+    /*! locks the mutex immediately */
+    Lock (Mutex& mutex)
+      : mutex(mutex), locked(true)
+    {
+      mutex.lock();
+    }
+
+    /*! takes over the given `locked` state without touching the mutex */
+    Lock (Mutex& mutex, bool locked)
+      : mutex(mutex), locked(locked) {}
+
+    ~Lock()
+    {
+      if (!locked) return;
+      mutex.unlock();
+    }
+
+    /*! locks the mutex; must not already be held through this object */
+    __forceinline void lock()
+    {
+      assert(!locked);
+      locked = true;
+      mutex.lock();
+    }
+
+    __forceinline bool isLocked() const {
+      return locked;
+    }
+
+  protected:
+    Mutex& mutex;
+    bool locked;
+  };
+}
--- a/thirdparty/embree/common/sys/platform.h
+++ b/thirdparty/embree/common/sys/platform.h
@@ -0,0 +1,616 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#if !defined(_CRT_SECURE_NO_WARNINGS)
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <cstddef>
+#include <cassert>
+#include <cstdlib>
+#include <cstdio>
+#include <memory>
+#include <stdexcept>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <string>
+#include <cstring>
+#include <stdint.h>
+#include <functional>
+#include <mutex>
+
+#if defined(EMBREE_SYCL_SUPPORT)
+
+#define __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#pragma clang diagnostic ignored "-W#pragma-messages"
+
+#include <sycl/sycl.hpp>
+
+#pragma clang diagnostic pop
+
+#include "sycl.h"
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#define CONSTANT __attribute__((opencl_constant))
+#else
+#define CONSTANT
+#endif
+
+#endif
+
+
+////////////////////////////////////////////////////////////////////////////////
+/// detect platform
+////////////////////////////////////////////////////////////////////////////////
+
+/* detect 32 or 64 Intel platform */
+/* __X86_64__ is defined for 64-bit x86 targets only; __X86_ASM__ is defined
+   for both 32-bit and 64-bit x86 targets */
+#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#define __X86_64__
+#define __X86_ASM__
+#elif defined(__i386__) || defined(_M_IX86)
+#define __X86_ASM__
+#endif
+
+/* detect 64 bit platform */
+/* __64BIT__ covers 64-bit x86 (via __X86_64__ above) and aarch64 */
+#if defined(__X86_64__) || defined(__aarch64__)
+#define __64BIT__
+#endif
+
+/* Operating system detection. Each section below maps the compiler-provided
+   macros onto one embree macro (__LINUX__, __FREEBSD__, __WIN32__, __MACOSX__);
+   every section except the Windows one also defines __UNIX__. */
+
+/* detect Linux platform */
+#if defined(linux) || defined(__linux__) || defined(__LINUX__)
+#  if !defined(__LINUX__)
+#     define __LINUX__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect FreeBSD platform */
+#if defined(__FreeBSD__) || defined(__FREEBSD__)
+#  if !defined(__FREEBSD__)
+#     define __FREEBSD__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect Windows 95/98/NT/2000/XP/Vista/7/8/10 platform */
+/* Cygwin is excluded here and treated as Unix below */
+#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__)
+#  if !defined(__WIN32__)
+#     define __WIN32__
+#  endif
+#endif
+
+/* detect Cygwin platform */
+#if defined(__CYGWIN__)
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect MAC OS X platform */
+#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__)
+#  if !defined(__MACOSX__)
+#     define __MACOSX__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* try to detect other Unix systems */
+#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix)
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Macros
+////////////////////////////////////////////////////////////////////////////////
+
+/* Symbol export/import decoration: __declspec on Windows (empty for static
+   library builds), default visibility for exports elsewhere. */
+#ifdef __WIN32__
+#  if defined(EMBREE_STATIC_LIB)
+#    define dll_export
+#    define dll_import
+#  else
+#    define dll_export __declspec(dllexport)
+#    define dll_import __declspec(dllimport)
+#  endif
+#else
+#  define dll_export __attribute__ ((visibility ("default")))
+#  define dll_import
+#endif
+
+/* Compiler keyword shims. The first branch applies to Windows builds that are
+   not MinGW and uses __declspec forms; the second branch uses the GCC-style
+   __attribute__ forms. */
+#if defined(__WIN32__) && !defined(__MINGW32__)
+#if !defined(__noinline)
+#define __noinline             __declspec(noinline)
+#endif
+//#define __forceinline        __forceinline
+//#define __restrict           __restrict
+#if defined(__INTEL_COMPILER)
+#define __restrict__           __restrict
+#else
+#define __restrict__           //__restrict // causes issues with MSVC
+#endif
+#if !defined(__thread) && !defined(__INTEL_LLVM_COMPILER)
+#define __thread               __declspec(thread)
+#endif
+#if !defined(__aligned)
+#define __aligned(...)           __declspec(align(__VA_ARGS__))
+#endif
+//#define __FUNCTION__           __FUNCTION__
+#define debugbreak()           __debugbreak()
+
+#else
+#if !defined(__noinline)
+#define __noinline             __attribute__((noinline))
+#endif
+#if !defined(__forceinline)
+#define __forceinline          inline __attribute__((always_inline))
+#endif
+//#define __restrict             __restrict
+//#define __thread               __thread
+#if !defined(__aligned)
+#define __aligned(...)           __attribute__((aligned(__VA_ARGS__)))
+#endif
+#if !defined(__FUNCTION__)
+#define __FUNCTION__           __PRETTY_FUNCTION__
+#endif
+/* NOTE(review): "int $3" is an x86 instruction - confirm behaviour on other targets */
+#define debugbreak()           asm ("int $3")
+#endif
+
+/* MAYBE_UNUSED: marks a declaration as possibly unused on clang/GCC; empty elsewhere */
+#if defined(__clang__) || defined(__GNUC__)
+  #define MAYBE_UNUSED __attribute__((unused))
+#else
+  #define MAYBE_UNUSED
+#endif
+
+/* _unused(x): evaluates x and discards the result */
+#if !defined(_unused)
+#define _unused(x) ((void)(x))
+#endif
+
+/* DELETED: expands to "= delete" where supported, to nothing on old MSVC */
+#if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly
+  #define DELETED
+#else
+  #define DELETED  = delete
+#endif
+
+/* likely/unlikely: branch hints via __builtin_expect; plain pass-through for
+   MSVC (without the Intel compiler) and for SYCL device code */
+#if !defined(likely)
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) || defined(__SYCL_DEVICE_ONLY__)
+#define   likely(expr) (expr)
+#define unlikely(expr) (expr)
+#else
+#define   likely(expr) __builtin_expect((bool)(expr),true )
+#define unlikely(expr) __builtin_expect((bool)(expr),false)
+#endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Error handling and debugging
+////////////////////////////////////////////////////////////////////////////////
+
+/* debug printing macros */
+/* STRING stringizes its argument as written; TOSTRING expands macros first.
+   PING prints file, line and function name. PRINT..PRINT4 print "expr = value"
+   pairs to embree_cout; UPRINT..UPRINT4 do the same on embree_cout_uniform. */
+#define STRING(x) #x
+#define TOSTRING(x) STRING(x)
+#define PING embree_cout_uniform << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl
+#define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl
+#define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl
+#define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl
+#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
+
+#define UPRINT(x) embree_cout_uniform << STRING(x) << " = " << (x) << embree_endl
+#define UPRINT2(x,y) embree_cout_uniform << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl
+#define UPRINT3(x,y,z) embree_cout_uniform << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl
+#define UPRINT4(x,y,z,w) embree_cout_uniform << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
+
+/* THROW_RUNTIME_ERROR does not throw: it aborts the process. In DEBUG builds
+   the file, line and message are reported first. The report goes to stderr and
+   ends with a newline, because abort() is not required to flush buffered
+   output and a message left in the stdout buffer could be lost. Note that the
+   expansion already ends with a semicolon. */
+#if defined(DEBUG) // only report file and line in debug mode
+  #define THROW_RUNTIME_ERROR(str) \
+    fprintf(stderr, "%s (%d): %s\n", __FILE__, __LINE__, std::string(str).c_str()), abort();
+    //throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+#else
+  #define THROW_RUNTIME_ERROR(str) \
+    abort(); //throw std::runtime_error(str);
+#endif
+
+/* FATAL forwards to THROW_RUNTIME_ERROR; WARNING prints "Warning: <x>" to
+   std::cerr and flushes; NOT_IMPLEMENTED raises FATAL with the current
+   function name. */
+#define FATAL(x)   THROW_RUNTIME_ERROR(x)
+#define WARNING(x) { std::cerr << "Warning: " << x << embree_endl << std::flush; }
+
+#define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented")
+
+////////////////////////////////////////////////////////////////////////////////
+/// Basic types
+////////////////////////////////////////////////////////////////////////////////
+
+/* embree::real is the default floating-point type (single precision) */
+namespace embree
+{
+  using real = float;
+}
+
+/* windows does not have ssize_t */
+/* defined as a signed integer of 64 bits when __64BIT__ is set, 32 bits otherwise */
+#if defined(__WIN32__)
+#if defined(__64BIT__)
+typedef int64_t ssize_t;
+#else
+typedef int32_t ssize_t;
+#endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Basic utility functions
+////////////////////////////////////////////////////////////////////////////////
+
+/*! converts an integer to its decimal text form via std::to_string */
+__forceinline std::string toString(long long value)
+{
+  std::string text = std::to_string(value);
+  return text;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Disable some compiler warnings
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__INTEL_COMPILER)
+//#pragma warning(disable:265 ) // floating-point operation result is out of range
+//#pragma warning(disable:383 ) // value copied to temporary, reference to temporary used
+//#pragma warning(disable:869 ) // parameter was never referenced
+//#pragma warning(disable:981 ) // operands are evaluated in unspecified order
+//#pragma warning(disable:1418) // external function definition with no prior declaration
+//#pragma warning(disable:1419) // external declaration in primary source file
+//#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable
+//#pragma warning(disable:94  ) // the size of an array must be greater than zero
+//#pragma warning(disable:1599) // declaration hides parameter
+//#pragma warning(disable:424 ) // extra ";" ignored
+#pragma warning(disable:2196) // routine is both "inline" and "noinline"
+//#pragma warning(disable:177 ) // label was declared but never referenced
+//#pragma warning(disable:114 ) // function was referenced but not defined
+//#pragma warning(disable:819 ) // template nesting depth does not match the previous declaration of function
+#pragma warning(disable:15335)  // was not vectorized: vectorization possible but seems inefficient
+#endif
+
+#if defined(_MSC_VER)
+//#pragma warning(disable:4200) // nonstandard extension used : zero-sized array in struct/union
+#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning)
+//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data
+#pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data
+#pragma warning(disable:4267) // conversion from 'size_t' to 'const int', possible loss of data
+//#pragma warning(disable:4355) // 'this' : used in base member initializer list
+//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch
+//#pragma warning(disable:4018) // '<' : signed / unsigned mismatch
+//#pragma warning(disable:4305) // 'initializing' : truncation from 'double' to 'float'
+//#pragma warning(disable:4068) // unknown pragma
+//#pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned
+//#pragma warning(disable:4838) // conversion from 'unsigned int' to 'const int' requires a narrowing conversion)
+//#pragma warning(disable:4227) // anachronism used : qualifiers on reference are ignored
+#pragma warning(disable:4503) // decorated name length exceeded, name was truncated
+#pragma warning(disable:4180) // qualifier applied to function type has no meaning; ignored
+#pragma warning(disable:4258) // definition from the for loop is ignored; the definition from the enclosing scope is used
+
+#  if _MSC_VER < 1910 // prior to Visual studio 2017 (V141)
+#    pragma warning(disable:4101) // warning C4101: 'x': unreferenced local variable // a compiler bug issues wrong warnings
+#    pragma warning(disable:4789) // buffer '' of size 8 bytes will be overrun; 32 bytes will be written starting at offset 0
+#  endif
+
+#endif
+
+#if defined(__clang__) && !defined(__INTEL_COMPILER)
+//#pragma clang diagnostic ignored "-Wunknown-pragmas"
+//#pragma clang diagnostic ignored "-Wunused-variable"
+//#pragma clang diagnostic ignored "-Wreorder"
+//#pragma clang diagnostic ignored "-Wmicrosoft"
+//#pragma clang diagnostic ignored "-Wunused-private-field"
+//#pragma clang diagnostic ignored "-Wunused-local-typedef"
+//#pragma clang diagnostic ignored "-Wunused-function"
+//#pragma clang diagnostic ignored "-Wnarrowing"
+//#pragma clang diagnostic ignored "-Wc++11-narrowing"
+//#pragma clang diagnostic ignored "-Wdeprecated-register"
+//#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+#pragma GCC diagnostic ignored "-Wpragmas"
+//#pragma GCC diagnostic ignored "-Wnarrowing"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+//#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+//#pragma GCC diagnostic ignored "-Warray-bounds"
+#pragma GCC diagnostic ignored "-Wattributes"
+#pragma GCC diagnostic ignored "-Wmisleading-indentation"
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+#if defined(__clang__) && defined(__WIN32__)
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#pragma clang diagnostic ignored "-Wmicrosoft-cast"
+#pragma clang diagnostic ignored "-Wmicrosoft-enum-value"
+#pragma clang diagnostic ignored "-Wmicrosoft-include"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunknown-pragmas"
+#endif
+
+/* disabling deprecated warning, please use only where use of deprecated Embree API functions is desired */
+/* One DISABLE/ENABLE pair per compiler. The clang and GCC ENABLE variants set
+   the diagnostic to "warning" rather than restoring a previously saved state.
+   No pair is defined when none of the compilers below is detected. */
+#if defined(__WIN32__) && defined(__INTEL_COMPILER)
+#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 1478)) // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING  __pragma(warning (enable:  1478)) // warning: function was declared deprecated
+#elif defined(__INTEL_COMPILER)
+#define DISABLE_DEPRECATED_WARNING _Pragma("warning (disable: 1478)") // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING  _Pragma("warning (enable : 1478)") // warning: function was declared deprecated
+#elif defined(__clang__)
+#define DISABLE_DEPRECATED_WARNING _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#define ENABLE_DEPRECATED_WARNING  _Pragma("clang diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#elif defined(__GNUC__)
+#define DISABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#define ENABLE_DEPRECATED_WARNING  _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#elif defined(_MSC_VER)
+#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING  __pragma(warning (enable : 4996)) // warning: function was declared deprecated
+#endif
+ 
+////////////////////////////////////////////////////////////////////////////////
+/// SYCL specific
+////////////////////////////////////////////////////////////////////////////////
+
+
+/* printf helpers for SYCL. Each macro copies the format string into a static
+   CONSTANT array and forwards to sycl::ext::oneapi::experimental::printf.
+   The variants with a trailing underscore take a format without arguments.
+   In device code sycl_printf0 / sycl_printf0_ only print on the sub-group lane
+   whose local id equals sycl::ctz(intel_sub_group_ballot(true)). */
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+
+#define sycl_printf0(format, ...) {               \
+    static const CONSTANT char fmt[] = format;               \
+    if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))       \
+      sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ );  \
+  }
+
+#define sycl_printf0_(format) {               \
+    static const CONSTANT char fmt[] = format;               \
+    if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))       \
+      sycl::ext::oneapi::experimental::printf(fmt);                \
+  }
+
+#else
+
+/* outside device code there is no lane selection */
+#define sycl_printf0(format, ...) {                          \
+    static const CONSTANT char fmt[] = format;               \
+    sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ );    \
+  }
+
+#define sycl_printf0_(format) {                              \
+    static const CONSTANT char fmt[] = format;               \
+    sycl::ext::oneapi::experimental::printf(fmt);                  \
+  }
+
+#endif
+
+/* unconditional variants: every caller prints */
+#define sycl_printf(format, ...) {               \
+    static const CONSTANT char fmt[] = format;               \
+    sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ );    \
+  }
+
+#define sycl_printf_(format) {               \
+    static const CONSTANT char fmt[] = format;               \
+    sycl::ext::oneapi::experimental::printf(fmt);                  \
+  }
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+
+namespace embree
+{
+  /*! Minimal stream stand-in for device code; `uniform` selects the printing
+   *  mode used by the operator<< overloads below. */
+  struct sycl_ostream_
+  {
+    bool uniform = false;
+
+    sycl_ostream_ (bool uniform) : uniform(uniform) {}
+  };
+
+  /*! tag type representing an end-of-line marker */
+  struct sycl_endl_
+  {
+  };
+
+/* stream aliases for SYCL device code: embree_cout creates a non-uniform
+   stream object, embree_cout_uniform a uniform one */
+#define embree_ostream embree::sycl_ostream_
+#define embree_cout embree::sycl_ostream_(false)
+#define embree_cout_uniform embree::sycl_ostream_(true)
+#define embree_endl embree::sycl_endl_()
+  
+  /*! Prints an int. Non-uniform streams print "%i " from every caller; uniform
+   *  streams print "%i" only on the lane selected by the sub-group ballot. */
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, int   i)
+  {
+    if (!cout.uniform) {
+      sycl_printf("%i ",i);
+    }
+    else if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) {
+      sycl_printf("%i",i);
+    }
+    return cout;
+  }
+  
+  /*! Prints an unsigned int. Non-uniform streams print "%u " from every caller;
+   *  uniform streams print "%u" only on the lane selected by the sub-group ballot. */
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, unsigned int i)
+  {
+    if (!cout.uniform) {
+      sycl_printf("%u ",i);
+    }
+    else if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) {
+      sycl_printf("%u",i);
+    }
+    return cout;
+  }
+
+  /*! Prints a float. Non-uniform streams print "%f " from every caller; uniform
+   *  streams print "%f" only on the lane selected by the sub-group ballot. */
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, float f)
+  {
+    if (!cout.uniform) {
+      sycl_printf("%f ",f);
+    }
+    else if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) {
+      sycl_printf("%f",f);
+    }
+    return cout;
+  }
+
+  /*! Prints a double. Non-uniform streams print "%f " from every caller; uniform
+   *  streams print "%f" only on the lane selected by the sub-group ballot. */
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, double d)
+  {
+    if (!cout.uniform) {
+      sycl_printf("%f ",d);
+    }
+    else if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) {
+      sycl_printf("%f",d);
+    }
+    return cout;
+  }
+
+  /*! Prints a uint64_t. Non-uniform streams print "%lu " from every caller;
+   *  uniform streams print "%lu" only on the lane selected by the sub-group ballot. */
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, uint64_t l)
+  {
+    if (!cout.uniform) {
+      sycl_printf("%lu ",l);
+    }
+    else if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) {
+      sycl_printf("%lu",l);
+    }
+    return cout;
+  }
+
+  /*! Prints a long. Non-uniform streams print from every caller; uniform
+   *  streams print only on the lane selected by the sub-group ballot.
+   *  The format is "%ld": a bare "%l" is a length modifier without a
+   *  conversion specifier and therefore not a valid printf directive. */
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, long l)
+  {
+    if (cout.uniform) {
+      if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))
+        sycl_printf("%ld",l);
+    } else
+      sycl_printf("%ld ",l);
+    
+    return cout;
+  }
+  
+    
+  /*! Prints a pointer. Non-uniform streams print "%p " from every caller;
+   *  uniform streams print "%p" only on the lane selected by the sub-group ballot. */
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, void* p)
+  {
+    if (!cout.uniform) {
+      sycl_printf("%p ",p);
+    }
+    else if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) {
+      sycl_printf("%p",p);
+    }
+    return cout;
+  }
+
+  /* Stream insertion of a C string. The uniform flag is not consulted: the
+   * text is always printed by the single lane whose sub-group local id equals
+   * ctz(intel_sub_group_ballot(true)). */
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, const char* c)
+  {
+    const bool printingLane = get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true));
+    if (printingLane)
+      sycl_printf("%s",c);
+
+    return cout;
+  }
+
+  /* End-of-line insertion: the single lane whose sub-group local id equals
+   * ctz(intel_sub_group_ballot(true)) prints a newline. */
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, sycl_endl_)
+  {
+    const bool printingLane = get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true));
+    if (printingLane)
+      sycl_printf_("\n");
+
+    return cout;
+  }
+}
+
+#else
+
+/* Fallback taken when the preceding preprocessor condition (not visible
+ * here) is false: the embree stream names map straight onto the standard
+ * iostream facilities, so the uniform and non-uniform streams are both
+ * std::cout. */
+#define embree_ostream std::ostream&
+#define embree_cout std::cout
+#define embree_cout_uniform std::cout
+#define embree_endl std::endl
+
+#endif
+
+#if defined(EMBREE_SYCL_SUPPORT)
+
+  /* printing out SYCL vector types */
+  /* float vectors: components are written comma-separated inside parentheses */
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::float4& v) {
+    out << "(" << v.x();
+    out << "," << v.y();
+    out << "," << v.z();
+    out << "," << v.w();
+    return out << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::float3& v) {
+    out << "(" << v.x();
+    out << "," << v.y();
+    out << "," << v.z();
+    return out << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::float2& v) {
+    out << "(" << v.x();
+    out << "," << v.y();
+    return out << ")";
+  }
+  /* int vectors: components are written comma-separated inside parentheses */
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::int4& v) {
+    out << "(" << v.x();
+    out << "," << v.y();
+    out << "," << v.z();
+    out << "," << v.w();
+    return out << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::int3& v) {
+    out << "(" << v.x();
+    out << "," << v.y();
+    out << "," << v.z();
+    return out << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::int2& v) {
+    out << "(" << v.x();
+    out << "," << v.y();
+    return out << ")";
+  }
+  /* unsigned int vectors: components are written comma-separated inside parentheses */
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::uint4& v) {
+    out << "(" << v.x();
+    out << "," << v.y();
+    out << "," << v.z();
+    out << "," << v.w();
+    return out << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::uint3& v) {
+    out << "(" << v.x();
+    out << "," << v.y();
+    out << "," << v.z();
+    return out << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::uint2& v) {
+    out << "(" << v.x();
+    out << "," << v.y();
+    return out << ")";
+  }
+
+#endif
+
+/* Writes n indentation units (two spaces each) to the stream; a
+ * non-positive n writes nothing. */
+inline void tab(std::ostream& cout, int n) {
+  int remaining = n;
+  while (remaining > 0) {
+    cout << "  ";
+    --remaining;
+  }
+}
+
+/* Returns an indentation string of two spaces per level of depth.
+ * A non-positive depth yields an empty string, matching the stream overload
+ * of tab(); without the guard a negative depth would be converted to a huge
+ * unsigned length and make the std::string constructor throw. */
+inline std::string tab(int depth) {
+  if (depth <= 0)
+    return std::string();
+  // widen before doubling so the multiplication cannot overflow int
+  return std::string(2*static_cast<std::string::size_type>(depth),' ');
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Some macros for static profiling
+////////////////////////////////////////////////////////////////////////////////
+
+// Two implementations of the marker macros: extended inline assembly when
+// __GNUC__ is defined, `__asm` blocks (presumably the MSVC syntax -- confirm) otherwise.
+#if defined (__GNUC__) 
+// IACA_SSC_MARK(id): moves the immediate `id` into ebx, then emits the raw bytes 0x64 0x67 0x90.
+// NOTE(review): the asm writes ebx but only "memory" is listed as a clobber -- confirm this is intended.
+#define IACA_SSC_MARK( MARK_ID )						\
+__asm__ __volatile__ (									\
+					  "\n\t  movl $"#MARK_ID", %%ebx"	\
+					  "\n\t  .byte 0x64, 0x67, 0x90"	\
+					  : : : "memory" );
+
+// IACA_UD_BYTES: emits the raw bytes 0x0F 0x0B (presumably the x86 UD2 opcode -- confirm).
+#define IACA_UD_BYTES __asm__ __volatile__ ("\n\t .byte 0x0F, 0x0B");
+
+#else
+// Same byte sequences as above, emitted through `__asm _emit`.
+#define IACA_UD_BYTES {__asm _emit 0x0F \
+	__asm _emit 0x0B}
+
+#define IACA_SSC_MARK(x) {__asm  mov ebx, x\
+	__asm  _emit 0x64 \
+	__asm  _emit 0x67 \
+	__asm  _emit 0x90 }
+
+// Only defined in this branch: markers written with __writegsbyte, using the
+// same 111 / 222 values as the start / end marks below.
+#define IACA_VC64_START __writegsbyte(111, 111);
+#define IACA_VC64_END   __writegsbyte(222, 222);
+
+#endif
+
+// Region delimiters: START is the UD bytes followed by mark 111,
+// END is mark 222 followed by the UD bytes.
+#define IACA_START {IACA_UD_BYTES \
+					IACA_SSC_MARK(111)}
+#define IACA_END {IACA_SSC_MARK(222) \
+					IACA_UD_BYTES}
+
+namespace embree
+{
+  /* Scope guard: runs the stored closure from its destructor unless
+   * deactivate() was called first. The pending call can be handed to another
+   * guard by move construction, which disarms the source; copying is disabled
+   * because a copy would carry active == true and the closure would then run
+   * once per copy. */
+  template<typename Closure>
+    struct OnScopeExitHelper
+  {
+    OnScopeExitHelper (const Closure f) : active(true), f(f) {}
+
+    /* takes over the pending call; f is const, so the closure itself is copied */
+    OnScopeExitHelper (OnScopeExitHelper&& other) : active(other.active), f(other.f) { other.active = false; }
+
+    OnScopeExitHelper (const OnScopeExitHelper&) = delete;
+    OnScopeExitHelper& operator =(const OnScopeExitHelper&) = delete;
+
+    ~OnScopeExitHelper() { if (active) f(); }
+
+    /* cancels the pending call */
+    void deactivate() { active = false; }
+
+    bool active;      // true while the closure still has to run at destruction
+    const Closure f;  // the code to run
+  };
+  
+  /* Factory for OnScopeExitHelper that deduces the closure type from its
+   * argument, so callers can hold the guard in an `auto` variable. */
+  template <typename Closure>
+    OnScopeExitHelper<Closure> OnScopeExit(const Closure f) {
+    typedef OnScopeExitHelper<Closure> Guard;
+    return Guard(f);
+  }
+
+// STRING_JOIN2 pastes its two arguments into one token. It goes through
+// DO_STRING_JOIN2 so that the arguments (e.g. __LINE__) are macro-expanded
+// before the ## operator is applied.
+#define STRING_JOIN2(arg1, arg2) DO_STRING_JOIN2(arg1, arg2)
+#define DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2
+// ON_SCOPE_EXIT(code) declares a local guard named on_scope_exit_<line number>
+// holding a lambda that captures by reference and executes `code`; the guard is
+// created through OnScopeExit. Because the name is derived from __LINE__, two
+// uses on the same source line in one scope would declare the same name.
+#define ON_SCOPE_EXIT(code)                                             \
+  auto STRING_JOIN2(on_scope_exit_, __LINE__) = OnScopeExit([&](){code;})
+
+  /* Wraps an already allocated raw pointer in a std::unique_ptr that takes
+   * over its ownership. */
+  template<typename Ty>
+    std::unique_ptr<Ty> make_unique(Ty* ptr) {
+    std::unique_ptr<Ty> owner(ptr);
+    return owner;
+  }
+
+}
--- a/thirdparty/embree/common/sys/ref.h
+++ b/thirdparty/embree/common/sys/ref.h
@@ -0,0 +1,122 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "atomic.h"
+
+namespace embree
+{
+  /* Empty tag type; Ref accepts it to construct, assign and compare against
+   * an empty reference. */
+  struct NullTy {};
+
+  /* the tag value, defined in another translation unit */
+  extern MAYBE_UNUSED NullTy null;
+  
+  /* Base class providing an atomic reference counter. refInc() and refDec()
+   * are virtual, so derived classes may override how references are counted. */
+  class RefCount
+  {
+  public:
+    RefCount(int val = 0) : refCounter(val) {}
+    virtual ~RefCount() {}
+
+    /* adds one reference and returns this object */
+    virtual RefCount* refInc()
+    {
+      refCounter.fetch_add(1);
+      return this;
+    }
+
+    /* removes one reference; the object deletes itself when the counter was 1
+     * before the decrement */
+    virtual void refDec()
+    {
+      const size_t previous = refCounter.fetch_sub(1);
+      if (previous == 1)
+        delete this;
+    }
+
+  private:
+    std::atomic<size_t> refCounter;
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reference to single object
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Intrusive reference-counting handle. Type has to provide refInc() and
+   * refDec() (as RefCount does): the handle calls refInc() whenever it starts
+   * sharing a non-null object and refDec() when it lets go of one.
+   *
+   * All assignment operators store the new pointer before releasing the old
+   * object. Releasing may delete the old object, and its destructor may in
+   * turn destroy the source handle (e.g. head = std::move(head->next)) or this
+   * handle itself, so no member is read or written after the release. */
+  template<typename Type>
+  class Ref
+  {
+  public:
+    Type* ptr;  // referenced object, or nullptr for an empty handle
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Ref() : ptr(nullptr) {}
+    __forceinline Ref(NullTy) : ptr(nullptr) {}
+    __forceinline Ref(const Ref& input) : ptr(input.ptr) { if (ptr) ptr->refInc(); }
+
+    /* steals the reference held by input and leaves input empty */
+    __forceinline Ref(Ref&& input) : ptr(input.ptr) { input.ptr = nullptr; }
+
+    __forceinline Ref(Type* const input) : ptr(input)
+    {
+      if (ptr)
+        ptr->refInc();
+    }
+
+    __forceinline ~Ref()
+    {
+      if (ptr)
+        ptr->refDec();
+    }
+
+    __forceinline Ref& operator =(const Ref& input)
+    {
+      Type* const previous = ptr;
+      if (input.ptr)
+        input.ptr->refInc();  // acquire first so self-assignment never drops to zero
+      ptr = input.ptr;
+      if (previous)
+        previous->refDec();
+      return *this;
+    }
+
+    __forceinline Ref& operator =(Ref&& input)
+    {
+      Type* const previous = ptr;
+      ptr = input.ptr;
+      input.ptr = nullptr;
+      if (previous)
+        previous->refDec();  // input is no longer touched once this may run destructors
+      return *this;
+    }
+
+    __forceinline Ref& operator =(Type* const input)
+    {
+      Type* const previous = ptr;
+      if (input)
+        input->refInc();
+      ptr = input;
+      if (previous)
+        previous->refDec();
+      return *this;
+    }
+
+    __forceinline Ref& operator =(NullTy)
+    {
+      Type* const previous = ptr;
+      ptr = nullptr;  // clear before releasing so a re-entrant destructor sees an empty handle
+      if (previous)
+        previous->refDec();
+      return *this;
+    }
+
+    /* true when an object is referenced */
+    __forceinline operator bool() const { return ptr != nullptr; }
+
+    /* access to the referenced object; the handle must not be empty */
+    __forceinline const Type& operator  *() const { return *ptr; }
+    __forceinline       Type& operator  *()       { return *ptr; }
+    __forceinline const Type* operator ->() const { return  ptr; }
+    __forceinline       Type* operator ->()       { return  ptr; }
+
+    /* new handle to the same object, converted with static_cast */
+    template<typename TypeOut>
+    __forceinline       Ref<TypeOut> cast()       { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); }
+    template<typename TypeOut>
+    __forceinline const Ref<TypeOut> cast() const { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); }
+
+    /* new handle converted with dynamic_cast; empty when the cast fails */
+    template<typename TypeOut>
+    __forceinline       Ref<TypeOut> dynamicCast()       { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); }
+    template<typename TypeOut>
+    __forceinline const Ref<TypeOut> dynamicCast() const { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); }
+  };
+
+  /* orders two handles by the address of the object they reference */
+  template<typename Type>
+  __forceinline bool operator < (const Ref<Type>& a, const Ref<Type>& b)
+  {
+    Type* const lhs = a.ptr;
+    Type* const rhs = b.ptr;
+    return lhs < rhs;
+  }
+
+  /* equality: a handle equals the null tag when it is empty, and two handles
+   * are equal when they reference the same object */
+  template<typename Type>
+  __forceinline bool operator ==(const Ref<Type>& a, NullTy) {
+    return !a.ptr;
+  }
+  template<typename Type>
+  __forceinline bool operator ==(NullTy, const Ref<Type>& b) {
+    return !b.ptr;
+  }
+  template<typename Type>
+  __forceinline bool operator ==(const Ref<Type>& a, const Ref<Type>& b) {
+    return !(a.ptr != b.ptr);
+  }
+
+  /* inequality: a handle differs from the null tag when it references an
+   * object, and two handles differ when they reference different objects */
+  template<typename Type>
+  __forceinline bool operator !=(const Ref<Type>& a, NullTy) {
+    return a.ptr ? true : false;
+  }
+  template<typename Type>
+  __forceinline bool operator !=(NullTy, const Ref<Type>& b) {
+    return b.ptr ? true : false;
+  }
+  template<typename Type>
+  __forceinline bool operator !=(const Ref<Type>& a, const Ref<Type>& b) {
+    return !(a.ptr == b.ptr);
+  }
+}
--- a/Show More
+++ b/Show More