17template <
class F,
class Abi>
19 assert(s <= 0 ||
static_cast<size_t>(+s) < x.size());
20 assert(s >= 0 ||
static_cast<size_t>(-s) < x.size());
21 const int n = x.size();
25template <
int S,
class F,
class Abi>
27 static_assert(S > 0 && S < x.size());
28 const int n = x.size();
32template <
int S,
class F,
class Abi>
34 static_assert(S > 0 && S < x.size());
35 const int n = x.size();
39template <
int S,
class F,
class Abi>
41 static_assert(S > 0 && S < x.size());
42 const int n = x.size();
46template <
int S,
class F,
class Abi>
48 static_assert(S > 0 && S < x.size());
52#if defined(__AVX512F__)
55 assert(s <= 0 ||
static_cast<size_t>(+s) < x.size());
56 assert(s >= 0 ||
static_cast<size_t>(-s) < x.size());
57 constexpr size_t N = x.size();
58 static constinit std::array<int64_t, 2 * N - 1> indices_lut = [] {
59 std::array<int64_t, 2 * N - 1> lut{};
60 for (
size_t i = 0; i < 2 * N - 1; ++i)
61 lut[i] =
static_cast<int64_t
>((i + 1) % N);
80 static constinit const int64_t *p = indices_lut.data() + N - 1;
83 const __m512i indices = _mm512_loadu_epi64(p - s);
84 __m512d y = _mm512_permutexvar_pd(indices,
static_cast<__m512d
>(x));
85 return decltype(x){y};
89 assert(s <= 0 ||
static_cast<size_t>(+s) < x.size());
90 assert(s >= 0 ||
static_cast<size_t>(-s) < x.size());
91 constexpr size_t N = x.size();
92 static constinit std::array<int64_t, 2 * N - 1> indices_lut = [] {
93 std::array<int64_t, 2 * N - 1> lut{};
94 for (
size_t i = 0; i < 2 * N - 1; ++i)
95 lut[i] =
static_cast<int64_t
>((i + 1) % N);
98 static constinit const int64_t *p = indices_lut.data() + N - 1;
101 const __m256i indices = _mm256_loadu_epi64(p - s);
102 __m256d y = _mm256_permutexvar_pd(indices,
static_cast<__m256d
>(x));
103 return decltype(x){y};
108 static_assert(S > 0 && S < x.size());
109 constexpr size_t N = x.size();
110 const __m512i indices = _mm512_set_epi64((S + 7) % N, (S + 6) % N, (S + 5) % N, (S + 4) % N,
111 (S + 3) % N, (S + 2) % N, (S + 1) % N, S % N);
112 __m512d y = _mm512_permutexvar_pd(indices,
static_cast<__m512d
>(x));
113 return decltype(x){y};
118 static_assert(S > 0 && S < x.size());
119 constexpr size_t N = x.size();
120 const __m512i indices =
121 _mm512_set_epi64((N - S + 7) % N, (N - S + 6) % N, (N - S + 5) % N, (N - S + 4) % N,
122 (N - S + 3) % N, (N - S + 2) % N, (N - S + 1) % N, (N - S) % N);
123 __m512d y = _mm512_permutexvar_pd(indices,
static_cast<__m512d
>(x));
124 return decltype(x){y};
129 static_assert(S > 0 && S < x.size());
130 constexpr uint8_t mask = (1u << (x.size() - S)) - 1u;
131 auto y =
static_cast<__m512d
>(
rotl<S>(x));
132 y = _mm512_mask_blend_pd(mask, _mm512_set1_pd(0), y);
133 return decltype(x){y};
138 static_assert(S > 0 && S < x.size());
139 constexpr uint8_t mask = (1u << S) - 1u;
140 auto y =
static_cast<__m512d
>(
rotr<S>(x));
141 y = _mm512_mask_blend_pd(mask, y, _mm512_set1_pd(0));
142 return decltype(x){y};
151 static_assert(S > 0 && S < x.size());
152 constexpr uint8_t mask = (1u << (x.size() - S)) - 1u;
153 auto y =
static_cast<__m256d
>(
rotl<S>(x));
154 y = _mm256_blend_pd(_mm256_set1_pd(0), y, mask);
155 return decltype(x){y};
160 static_assert(S > 0 && S < x.size());
161 constexpr uint8_t mask = (1u << S) - 1u;
162 auto y =
static_cast<__m256d
>(
rotr<S>(x));
163 y = _mm256_blend_pd(y, _mm256_set1_pd(0), mask);
164 return decltype(x){y};
169#if defined(__AVX512F__)
173 static_assert(S > 0 && S < x.size());
174 constexpr size_t N = x.size();
175 const __m256i indices = _mm256_set_epi64x((S + 3) % N, (S + 2) % N, (S + 1) % N, S % N);
176 __m256d y = _mm256_permutexvar_pd(indices,
static_cast<__m256d
>(x));
177 return decltype(x){y};
182 static_assert(S > 0 && S < x.size());
183 constexpr size_t N = x.size();
184 const __m256i indices =
185 _mm256_set_epi64x((N - S + 3) % N, (N - S + 2) % N, (N - S + 1) % N, (N - S) % N);
186 __m256d y = _mm256_permutexvar_pd(indices,
static_cast<__m256d
>(x));
187 return decltype(x){y};
190#elif defined(__AVX2__)
194 static_assert(S > 0 && S < x.size());
195 constexpr size_t N = x.size();
196 constexpr int indices =
197 (((S + 3) % N) << 6) | (((S + 2) % N) << 4) | (((S + 1) % N) << 2) | (S % N);
198 __m256d y = _mm256_permute4x64_pd(
static_cast<__m256d
>(x), indices);
199 return decltype(x){y};
204 static_assert(S > 0 && S < x.size());
205 constexpr size_t N = x.size();
206 constexpr int indices = (((N - S + 3) % N) << 6) | (((N - S + 2) % N) << 4) |
207 (((N - S + 1) % N) << 2) | ((N - S) % N);
208 __m256d y = _mm256_permute4x64_pd(
static_cast<__m256d
>(x), indices);
209 return decltype(x){y};
225template <
int S,
class F,
class Abi>
227 if constexpr (S % x.size() == 0)
229 else if constexpr (S < 0)
238template <
int S,
class F,
class Abi>
240 if constexpr (S % x.size() == 0)
242 else if constexpr (S < 0)
251template <
int S,
class F,
class Abi>
253 if constexpr (S == 0)
255 else if constexpr (S >=
static_cast<int>(x.size()) || -S >=
static_cast<int>(x.size()))
257 else if constexpr (S < 0)
266template <
int S,
class F,
class Abi>
268 if constexpr (S == 0)
270 else if constexpr (S >=
static_cast<int>(x.size()) || -S >=
static_cast<int>(x.size()))
272 else if constexpr (S < 0)
datapar::simd< F, Abi > rotl(datapar::simd< F, Abi > x)
Rotates the elements of x by s positions to the left.
datapar::simd< F, Abi > rotr(datapar::simd< F, Abi > x)
Rotates the elements of x to the right by S positions.
datapar::simd< F, Abi > shiftl(datapar::simd< F, Abi > x)
Shifts the elements of x to the left by S positions, shifting in zeros.
datapar::simd< F, Abi > shiftr(datapar::simd< F, Abi > x)
Shifts the elements of x to the right by S positions, shifting in zeros.
simd< Tp, deduced_abi< Tp, Np > > deduced_simd
stdx::simd< Tp, Abi > simd
datapar::simd< F, Abi > shiftl(datapar::simd< F, Abi > x)
datapar::simd< F, Abi > rotr(datapar::simd< F, Abi > x)
datapar::simd< F, Abi > rotl(datapar::simd< F, Abi > x)
datapar::simd< F, Abi > shiftr(datapar::simd< F, Abi > x)
datapar::simd< F, Abi > rot(datapar::simd< F, Abi > x, int s)
Rotates the elements of x to the right by s positions.