batmat 0.0.18
Batched linear algebra routines
Loading...
Searching...
No Matches
transpose.hpp
Go to the documentation of this file.
1#pragma once
2
3#include <batmat/assume.hpp>
4#include <batmat/config.hpp>
5#include <batmat/simd.hpp>
6#include <batmat/unroll.h>
7
8namespace batmat::ops {
9
10/// @addtogroup topic-low-level-ops
11/// @{
12
13/// @name Transposition
14/// @{
15
16/// Transposes the @p R × @p C matrix at @p pa with leading dimension @p lda, writing the result to
17/// @p pb with leading dimension @p ldb, writing only the @p d first columns of the result.
18template <index_t R, index_t C, class T>
19[[gnu::always_inline]]
20inline void transpose_dyn(const T *pa, index_t lda, T *pb, index_t ldb, index_t d = R) {
21 BATMAT_ASSUME(d <= R);
22 T r[C][R];
23 BATMAT_FULLY_UNROLLED_FOR (int i = 0; i < C; ++i)
24 BATMAT_FULLY_UNROLLED_FOR (int j = 0; j < R; ++j)
25 r[i][j] = pa[i * lda + j];
26 BATMAT_FULLY_UNROLLED_FOR (int i = 0; i < d; ++i)
27 BATMAT_FULLY_UNROLLED_FOR (int j = 0; j < C; ++j)
28 pb[i * ldb + j] = r[j][i];
29}
30
31#ifdef __AVX2__
32template <>
33[[gnu::always_inline]]
34inline void transpose_dyn<4, 4>(const double *pa, index_t lda, double *pb, index_t ldb, index_t d) {
36 simd cols[4], shuf[4];
37 BATMAT_FULLY_UNROLLED_FOR (int i = 0; i < 4; ++i)
38 cols[i] = datapar::unaligned_load<simd>(pa + i * lda);
39 // clang-format off
40 shuf[0] = simd{_mm256_shuffle_pd((__m256d)cols[0], (__m256d)cols[1], 0b0000)};
41 shuf[1] = simd{_mm256_shuffle_pd((__m256d)cols[0], (__m256d)cols[1], 0b1111)};
42 shuf[2] = simd{_mm256_shuffle_pd((__m256d)cols[2], (__m256d)cols[3], 0b0000)};
43 shuf[3] = simd{_mm256_shuffle_pd((__m256d)cols[2], (__m256d)cols[3], 0b1111)};
44 cols[0] = simd{_mm256_permute2f128_pd((__m256d)shuf[0], (__m256d)shuf[2], 0b00100000)};
45 datapar::unaligned_store(cols[0], pb + 0 * ldb);
46 if (d < 2) [[unlikely]] return;
47 cols[1] = simd{_mm256_permute2f128_pd((__m256d)shuf[1], (__m256d)shuf[3], 0b00100000)};
48 datapar::unaligned_store(cols[1], pb + 1 * ldb);
49 if (d < 3) [[unlikely]] return;
50 cols[2] = simd{_mm256_permute2f128_pd((__m256d)shuf[0], (__m256d)shuf[2], 0b00110001)};
51 datapar::unaligned_store(cols[2], pb + 2 * ldb);
52 if (d < 4) [[unlikely]] return;
53 cols[3] = simd{_mm256_permute2f128_pd((__m256d)shuf[1], (__m256d)shuf[3], 0b00110001)};
54 datapar::unaligned_store(cols[3], pb + 3 * ldb);
55 // clang-format on
56}
57#endif
58
59/// Transposes the @p R × @p C matrix at @p pa with leading dimension @p lda, writing the result to
60/// @p pb with leading dimension @p ldb.
61template <index_t R, index_t C, class T>
62[[gnu::always_inline]]
63inline void transpose(const T *pa, index_t lda, T *pb, index_t ldb) {
64 transpose_dyn<R, C>(pa, lda, pb, ldb, R);
65}
66
67/// @}
68
69/// @}
70
71} // namespace batmat::ops
#define BATMAT_ASSUME(x)
Invokes undefined behavior if the expression x does not evaluate to true.
Definition assume.hpp:17
void transpose(const T *pa, index_t lda, T *pb, index_t ldb)
Transposes the R × C matrix at pa with leading dimension lda, writing the result to pb with leading dimension ldb.
Definition transpose.hpp:63
void transpose_dyn(const T *pa, index_t lda, T *pb, index_t ldb, index_t d=R)
Transposes the R × C matrix at pa with leading dimension lda, writing the result to pb with leading dimension ldb, writing only the d first columns of the result.
Definition transpose.hpp:20
V unaligned_load(const typename V::value_type *p)
Definition simd.hpp:106
void unaligned_store(V v, typename V::value_type *p)
Definition simd.hpp:116
simd< Tp, deduced_abi< Tp, Np > > deduced_simd
Definition simd.hpp:103
stdx::simd< Tp, Abi > simd
Definition simd.hpp:99
constexpr auto cols(const MatrixView< T, I, S, O > &v)
Definition simdify.hpp:20
#define BATMAT_FULLY_UNROLLED_FOR(...)
Definition unroll.h:27