Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ jobs:
fi
if [[ '${{ matrix.sys.flags }}' == 'avx512vl_128' ]]; then
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"
CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128"
CXXFLAGS="$CXXFLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oopsie. Thanks for fixing this one.

fi
if [[ '${{ matrix.sys.flags }}' == 'avx512vl_256' ]]; then
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"
Expand Down
136 changes: 68 additions & 68 deletions include/xsimd/arch/common/xsimd_common_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#define XSIMD_COMMON_MEMORY_HPP

#include "../../types/xsimd_batch_constant.hpp"
#include "../../utils/xsimd_type_traits.hpp"
#include "./xsimd_common_details.hpp"

#include <algorithm>
Expand Down Expand Up @@ -360,88 +361,87 @@ namespace xsimd
return load_unaligned<A>(mem, convert<T> {}, A {});
}

template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE batch<T_out, A>
load_masked(T_in const* mem, batch_bool_constant<T_out, A, Values...>, convert<T_out>, alignment, requires_arch<common>) noexcept
{
constexpr std::size_t size = batch<T_out, A>::size;
alignas(A::alignment()) std::array<T_out, size> buffer {};
constexpr bool mask[size] = { Values... };

for (std::size_t i = 0; i < size; ++i)
buffer[i] = mask[i] ? static_cast<T_out>(mem[i]) : T_out(0);

return batch<T_out, A>::load(buffer.data(), aligned_mode {});
}

template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE void
store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, requires_arch<common>) noexcept
// Masked-memory dispatch idiom. To give an arch a native masked path, add a
// `requires_arch<that-arch>` overload in its arch file; conversion ranking makes
// it beat the inherited one. Keep this base layer arch-agnostic:
// (a) specialize via a concrete `requires_arch<arch>` overload -- no register
// tag, no `enable_if` on `A`;
// (b) base overloads use the `requires_arch<common>` tag only; a generic
// `requires_arch<A>` here ties with an arch's own overload (gcc-10 ambiguity);
// (c) capability decisions go through arch-agnostic traits (see below).
namespace detail
{
constexpr std::size_t size = batch<T_in, A>::size;
constexpr bool mask[size] = { Values... };
// True when an integer access can borrow the same-width float `vmaskmov*` path
// (integral type, same-size float exists, arch has that float register);
// otherwise the scalar-buffer fallback is used. Names no architecture.
template <class A, class T_in, class T_out>
using masked_memory_uses_fp_bitcast = std::integral_constant<bool,
std::is_same<T_in, T_out>::value
&& std::is_integral<T_out>::value
&& !std::is_void<sized_fp_t<sizeof(T_out)>>::value
&& types::has_simd_register<sized_fp_t<sizeof(T_out)>, A>::value>;

for (std::size_t i = 0; i < size; ++i)
if (mask[i])
{
mem[i] = static_cast<T_out>(src.get(i));
}
}
// Scalar-buffer fallback: materialize masked-off lanes as zero, then load.
template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE batch<T_out, A>
load_masked_common(T_in const* mem, batch_bool_constant<T_out, A, Values...>, convert<T_out>, alignment, std::false_type /* uses_fp_bitcast */) noexcept
{
constexpr std::size_t size = batch<T_out, A>::size;
alignas(A::alignment()) std::array<T_out, size> buffer {};
constexpr bool mask[size] = { Values... };

template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
{
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
return bitwise_cast<int32_t>(f);
}
for (std::size_t i = 0; i < size; ++i)
buffer[i] = mask[i] ? static_cast<T_out>(mem[i]) : T_out(0);

template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept
{
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
return bitwise_cast<uint32_t>(f);
}
return batch<T_out, A>::load(buffer.data(), aligned_mode {});
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<int64_t, A>>
load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...>, convert<int64_t>, Mode, requires_arch<A>) noexcept
{
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
return bitwise_cast<int64_t>(d);
}
// Integer-via-float path: reinterpret to the same-width float type, reuse the
// floating-point masked load (e.g. `vmaskmovps`), then bitcast the result back.
template <class A, class T, bool... Values, class Mode>
XSIMD_INLINE batch<T, A>
load_masked_common(T const* mem, batch_bool_constant<T, A, Values...>, convert<T>, Mode, std::true_type /* uses_fp_bitcast */) noexcept
{
using fp_t = sized_fp_t<sizeof(T)>;
const auto f = ::xsimd::kernel::load_masked<A>(reinterpret_cast<const fp_t*>(mem), batch_bool_constant<fp_t, A, Values...> {}, convert<fp_t> {}, Mode {}, A {});
return bitwise_cast<T>(f);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<uint64_t, A>>
load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<A>) noexcept
{
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
return bitwise_cast<uint64_t>(d);
}
template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE void
store_masked_common(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, std::false_type /* uses_fp_bitcast */) noexcept
{
constexpr std::size_t size = batch<T_in, A>::size;
constexpr bool mask[size] = { Values... };

template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
}
for (std::size_t i = 0; i < size; ++i)
if (mask[i])
{
mem[i] = static_cast<T_out>(src.get(i));
}
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
template <class A, class T, bool... Values, class Mode>
XSIMD_INLINE void
store_masked_common(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...>, Mode, std::true_type /* uses_fp_bitcast */) noexcept
{
using fp_t = sized_fp_t<sizeof(T)>;
::xsimd::kernel::store_masked<A>(reinterpret_cast<fp_t*>(mem), bitwise_cast<fp_t>(src), batch_bool_constant<fp_t, A, Values...> {}, Mode {}, A {});
}
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>
store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...>, Mode, requires_arch<A>) noexcept
template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE batch<T_out, A>
load_masked(T_in const* mem, batch_bool_constant<T_out, A, Values...> mask, convert<T_out> cvt, alignment mode, requires_arch<common>) noexcept
{
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
return detail::load_masked_common(mem, mask, cvt, mode, detail::masked_memory_uses_fp_bitcast<A, T_in, T_out> {});
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>
store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<A>) noexcept
template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE void
store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...> mask, alignment mode, requires_arch<common>) noexcept
{
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
detail::store_masked_common(mem, src, mask, mode, detail::masked_memory_uses_fp_bitcast<A, T_in, T_out> {});
}

template <class A, class T_in, class T_out>
Expand Down
54 changes: 34 additions & 20 deletions include/xsimd/arch/xsimd_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -993,19 +993,20 @@ namespace xsimd
{
using int_t = as_integer_t<T>;
constexpr size_t half_size = batch<T, A>::size / 2;
using half_arch = typename ::xsimd::make_sized_batch_t<T, half_size>::arch_type;

// confined to lower 128-bit half → forward to 128 bit
// lower 128-bit half
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
{
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {});
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(batch_bool_constant<int_t, A, Values...> {});
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, half_arch {});
return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
}
// confined to upper 128-bit half → forward to 128 bit
// upper 128-bit half
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
{
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {});
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, half_arch {});
return detail::zero_extend<A>(hi);
}
else
Expand All @@ -1018,41 +1019,54 @@ namespace xsimd
// store_masked
namespace detail
{
template <class A>
// True when batch_bool<T, A> is the legacy VEX vector mask, i.e. it is stored
// in the same register as the data (__m256 / __m256d) rather than in an EVEX
// k-register (__mmask8) as on the avx512vl architectures. The _mm256_cast*_si256
// path below is only well-formed for the vector-mask representation. This names
// no architecture — it tests the mask's representation, in the spirit of
// detail::masked_memory_uses_fp_bitcast.
template <class T, class A>
using uses_vector_mask = std::is_same<typename batch_bool<T, A>::register_type,
typename batch<T, A>::register_type>;

template <class A, class = std::enable_if_t<uses_vector_mask<float, A>::value>>
XSIMD_INLINE void maskstore(float* mem, batch_bool<float, A> const& mask, batch<float, A> const& src) noexcept
{
_mm256_maskstore_ps(mem, mask, src);
_mm256_maskstore_ps(mem, _mm256_castps_si256(mask), src);
}

template <class A>
template <class A, class = std::enable_if_t<uses_vector_mask<double, A>::value>>
XSIMD_INLINE void maskstore(double* mem, batch_bool<double, A> const& mask, batch<double, A> const& src) noexcept
{
_mm256_maskstore_pd(mem, mask, src);
_mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), src);
}
}

template <class A, class T, bool... Values, class Mode>
template <class A, class T, bool... Values, class Mode,
typename = std::enable_if_t<std::is_floating_point<T>::value && detail::uses_vector_mask<T, A>::value>>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept
{
constexpr size_t half_size = batch<T, A>::size / 2;
using half_batch = ::xsimd::make_sized_batch_t<T, half_size>;
using half_arch = typename half_batch::arch_type;

// confined to lower 128-bit half → forward to 128 bit
// lower 128-bit half
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
{
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
const auto lo = detail::lower_half(src);
store_masked<avx_128>(mem, lo, mlo, Mode {}, sse4_2 {});
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);
const half_batch lo = detail::lower_half(src);
store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});
}
// confined to upper 128-bit half → forward to 128 bit
// upper 128-bit half
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
{
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
const auto hi = detail::upper_half(src);
store_masked<avx_128>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
const half_batch hi = detail::upper_half(src);
store_masked<half_arch>(mem + half_size, hi, mhi, Mode {}, half_arch {});
}
else
{
detail::maskstore(mem, mask.as_batch(), src);
detail::maskstore(mem, mask.as_batch_bool(), src);
}
}

Expand Down
25 changes: 14 additions & 11 deletions include/xsimd/arch/xsimd_avx2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,24 +190,27 @@ namespace xsimd
}
}

template <class A, class T, bool... Values, class Mode>
template <class A, class T, bool... Values, class Mode,
typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4)>>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
{
constexpr size_t lanes_per_half = batch<T, A>::size / 2;
using half_batch = ::xsimd::make_sized_batch_t<T, lanes_per_half>;
using half_arch = typename half_batch::arch_type;

// confined to lower 128-bit half → forward to SSE
// lower 128-bit half
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half)
{
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
const auto lo = detail::lower_half(src);
store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);
const half_batch lo = detail::lower_half(src);
store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});
}
// confined to upper 128-bit half → forward to SSE
// upper 128-bit half
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half)
{
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
const auto hi = detail::upper_half(src);
store_masked<sse4_2>(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {});
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
const half_batch hi = detail::upper_half(src);
store_masked<half_arch>(mem + lanes_per_half, hi, mhi, Mode {}, half_arch {});
}
else
{
Expand All @@ -216,10 +219,10 @@ namespace xsimd
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
{
const auto s32 = bitwise_cast<int32_t>(src);
store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, mask, Mode {}, avx2 {});
store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, batch_bool_constant<int32_t, A, Values...> {}, Mode {}, avx2 {});
}

template <class A, bool... Values, class Mode>
Expand Down
Loading
Loading