14#include <guanaqo/trace.hpp>
31 [[maybe_unused]]
static constexpr const auto trace_name =
37 B.rows() * B.cols() * B.depth());
38 const auto I = B.rows(), J = B.cols();
39 if (I == 0 || J == 0 || B.depth() == 0)
43 typename types::simd A{a};
44 const index_t JI_adif = max<index_t>(0, J - I), IJ_adif = max<index_t>(0, I - J);
45 if constexpr (OB == StorageOrder::ColMajor)
46 for (index_t j = 0; j < J; ++j) {
47 const index_t i0 = Conf.struc ==
LowerTriangular ? max<index_t>(0, j - JI_adif) : 0;
48 const index_t i1 = Conf.struc ==
UpperTriangular ? min(j + 1 + IJ_adif, I) : I;
50 types::template aligned_store<Conf.mask>(A, &B(0, i, j));
53 for (index_t i = 0; i < I; ++i) {
54 const index_t j0 = Conf.struc ==
UpperTriangular ? max<index_t>(0, i - IJ_adif) : 0;
55 const index_t j1 = Conf.struc ==
LowerTriangular ? min(i + 1 + JI_adif, J) : J;
57 types::template aligned_store<Conf.mask>(A, &B(0, i, j));
76 [[maybe_unused]]
static constexpr const auto trace_name =
82 A.rows() * A.cols() * A.depth());
83 assert(A.rows() == B.rows());
84 assert(A.cols() == B.cols());
85 const auto I = A.rows(), J = A.cols();
86 if (I == 0 || J == 0 || A.depth() == 0)
90 const index_t JI_adif = max<index_t>(0, J - I), IJ_adif = max<index_t>(0, I - J);
91 if constexpr (OA == StorageOrder::ColMajor)
92 for (index_t j = 0; j < J; ++j) {
93 const index_t i0 = Conf.struc ==
LowerTriangular ? max<index_t>(0, j - JI_adif) : 0;
94 const index_t i1 = Conf.struc ==
UpperTriangular ? min(j + 1 + IJ_adif, I) : I;
96 types::template aligned_store<Conf.mask>(
97 rotl<Conf.rotate>(types::aligned_load(&A(0, i, j))), &B(0, i, j));
100 for (index_t i = 0; i < I; ++i) {
101 const index_t j0 = Conf.struc ==
UpperTriangular ? max<index_t>(0, i - IJ_adif) : 0;
102 const index_t j1 = Conf.struc ==
LowerTriangular ? min(i + 1 + JI_adif, J) : J;
104 types::template aligned_store<Conf.mask>(
105 rotl<Conf.rotate>(types::aligned_load(&A(0, i, j))), &B(0, i, j));
111 requires(std::same_as<Abi, datapar::scalar_abi<T>> && OA == OB &&
115 assert(A.rows() == B.rows());
116 assert(A.cols() == B.cols());
117 if constexpr (Conf.mask != 0)
119 if (A.rows() == 0 || A.cols() == 0 || A.depth() == 0)
122 static_assert(
typename decltype(A)::batch_size_type() == 1);
123 static_assert(
typename decltype(B)::batch_size_type() == 1);
124 if constexpr (OA == StorageOrder::ColMajor)
125 for (index_t j = 0; j < A.cols(); ++j)
126 std::copy_n(&A(0, 0, j), A.rows(), &B(0, 0, j));
128 for (index_t i = 0; i < A.rows(); ++i)
129 std::copy_n(&A(0, i, 0), A.cols(), &B(0, i, 0));
134 requires(std::same_as<Abi, datapar::scalar_abi<T>> && OA != OB &&
138 assert(A.rows() == B.rows());
139 assert(A.cols() == B.cols());
140 if constexpr (Conf.mask != 0)
142 if (A.rows() == 0 || A.cols() == 0 || A.depth() == 0)
147 [[maybe_unused]]
static const constinit auto lut =
153 if constexpr (OA == StorageOrder::ColMajor)
156 foreach_chunked_merged(0, A.rows(), R, [&](index_t r, auto nr) {
157 lut[nr - 1][nc - 1](&A(0, r, c), A.outer_stride(), &B(0, r, c), B.outer_stride());
162 foreach_chunked_merged(0, A.cols(), C, [&](index_t c, auto nc) {
163 lut[nc - 1][nr - 1](&A(0, r, c), A.outer_stride(), &B(0, r, c), B.outer_stride());
168template <
class... Opts>
187void copy(VA &&A, VB &&B, Opts... opts) {
194template <MatrixStructure S, simdifiable VA, simdifiable VB, rotate_opt... Opts>
195 requires simdify_compatible<VA, VB>
203template <simdifiable VB>
209template <MatrixStructure S, simdifiable VB>
void copy(VA &&A, VB &&B, Opts... opts)
B = A.
void fill(simdified_value_t< VB > a, VB &&B)
B = a.
datapar::simd< F, Abi > rotl(datapar::simd< F, Abi > x)
Rotates the elements of x by s positions to the left.
void transpose(const T *pa, index_t lda, T *pb, index_t ldb)
Transposes the R × C matrix at pa with leading dimension lda, writing the result to pb with leading dimension ldb.
datapar::simd< F, Abi > rotr(datapar::simd< F, Abi > x)
Rotate the elements of x to the right by S positions.
void foreach_chunked_merged(index_t i_begin, index_t i_end, auto chunk_size, auto func_chunk, LoopDir dir=LoopDir::Forward)
Iterate over the range [i_begin, i_end) in chunks of size chunk_size, calling func_chunk for each chunk.
consteval auto make_2d_lut(F f)
Returns a 2D array of the form:
#define GUANAQO_TRACE_LINALG(name, gflops)
#define GUANAQO_TRACE_STATIC_STR(s)
void fill(T a, view< T, Abi, OB > B)
constexpr CopyConfig apply_options(CopyConfig conf, Opts...)
void copy(view< const T, Abi, OA > A, view< T, Abi, OB > B)
typename detail::simdified_value< V >::type simdified_value_t
typename detail::simdified_abi< V >::type simdified_abi_t
constexpr bool simdify_compatible
constexpr auto simdify(simdifiable auto &&a) -> simdified_view_t< decltype(a)>
constexpr std::optional< int > get_rotate
constexpr std::optional< int > get_mask
simd_view_types< std::remove_const_t< T >, Abi >::template view< T, Order > view
constexpr index_t RowsRegTranspose
constexpr index_t ColsRegTranspose
std::integral_constant< index_t, I > index_constant
Light-weight wrapper class used for overload resolution of triangular and symmetric matrices.
#define BATMAT_UNROLLED_IVDEP_FOR(N,...)