14#include <guanaqo/trace.hpp>
31 [[maybe_unused]]
static constexpr const auto trace_name =
37 B.rows() * B.cols() * B.depth());
38 const auto I = B.rows(), J = B.cols();
39 if (I == 0 || J == 0 || B.depth() == 0)
43 typename types::simd A{a};
44 const index_t JI_adif = max<index_t>(0, J - I), IJ_adif = max<index_t>(0, I - J);
45 if constexpr (OB == StorageOrder::ColMajor)
46 for (
index_t j = 0; j < J; ++j) {
50 types::template aligned_store<Conf.mask>(A, &B(0, i, j));
53 for (
index_t i = 0; i < I; ++i) {
57 types::template aligned_store<Conf.mask>(A, &B(0, i, j));
76 [[maybe_unused]]
static constexpr const auto trace_name =
82 A.rows() * A.cols() * A.depth());
83 assert(A.rows() == B.rows());
84 assert(A.cols() == B.cols());
85 const auto I = A.rows(), J = A.cols();
86 if (I == 0 || J == 0 || A.depth() == 0)
90 const index_t JI_adif = max<index_t>(0, J - I), IJ_adif = max<index_t>(0, I - J);
91 if constexpr (OA == StorageOrder::ColMajor)
92 for (
index_t j = 0; j < J; ++j) {
96 types::template aligned_store<Conf.mask>(
97 rotl<Conf.rotate>(types::aligned_load(&A(0, i, j))), &B(0, i, j));
100 for (
index_t i = 0; i < I; ++i) {
104 types::template aligned_store<Conf.mask>(
105 rotl<Conf.rotate>(types::aligned_load(&A(0, i, j))), &B(0, i, j));
111 requires(std::same_as<Abi, datapar::scalar_abi<T>> && OA == OB &&
115 assert(A.rows() == B.rows());
116 assert(A.cols() == B.cols());
117 if constexpr (Conf.mask != 0)
119 if (A.rows() == 0 || A.cols() == 0 || A.depth() == 0)
122 static_assert(
typename decltype(A)::batch_size_type() == 1);
123 static_assert(
typename decltype(B)::batch_size_type() == 1);
124 if constexpr (OA == StorageOrder::ColMajor)
125 for (
index_t j = 0; j < A.cols(); ++j)
126 std::copy_n(&A(0, 0, j), A.rows(), &B(0, 0, j));
128 for (
index_t i = 0; i < A.rows(); ++i)
129 std::copy_n(&A(0, i, 0), A.cols(), &B(0, i, 0));
134 requires(std::same_as<Abi, datapar::scalar_abi<T>> && OA != OB &&
138 assert(A.rows() == B.rows());
139 assert(A.cols() == B.cols());
140 if constexpr (Conf.mask != 0)
142 if (A.rows() == 0 || A.cols() == 0 || A.depth() == 0)
147 [[maybe_unused]]
static const constinit auto lut =
153 if constexpr (OA == StorageOrder::ColMajor)
156 foreach_chunked_merged(0, A.rows(), R, [&](index_t r, auto nr) {
157 lut[nr - 1][nc - 1](&A(0, r, c), A.outer_stride(), &B(0, r, c), B.outer_stride());
162 foreach_chunked_merged(0, A.cols(), C, [&](index_t c, auto nc) {
163 lut[nc - 1][nr - 1](&A(0, r, c), A.outer_stride(), &B(0, r, c), B.outer_stride());
168template <
class... Opts>
187void copy(VA &&A, VB &&B, Opts... opts) {
194template <MatrixStructure S, simdifiable VA, simdifiable VB, rotate_opt... Opts>
195 requires simdify_compatible<VA, VB>
203template <simdifiable VB>
209template <MatrixStructure S, simdifiable VB>
221template <simdifiable_multi VA, simdifiable_multi VB, rotate_opt... Opts>
222 requires simdify_compatible<VA, VB>
223void copy(VA &&A, VB &&B, Opts... opts) {
225 for (
index_t b = 0; b < A.num_batches(); ++b)
226 copy(A.batch(b), B.batch(b), opts...);
230template <MatrixStructure S, simdifiable_multi VA, simdifiable_multi VB, rotate_opt... Opts>
231 requires simdify_compatible<VA, VB>
239template <simdifiable_multi VB>
241 for (
index_t b = 0; b < B.num_batches(); ++b)
246template <MatrixStructure S, simdifiable_multi VB>
void copy(VA &&A, VB &&B, Opts... opts)
B = A.
constexpr auto make_structured(M &&m)
View with the given structure.
void fill(simdified_value_t< VB > a, VB &&B)
B = a.
datapar::simd< F, Abi > rotl(datapar::simd< F, Abi > x)
Rotates the elements of x by s positions to the left.
void transpose(const T *pa, index_t lda, T *pb, index_t ldb)
Transposes the R × C matrix at pa with leading dimension lda, writing the result to pb with leading dimension ldb.
datapar::simd< F, Abi > rotr(datapar::simd< F, Abi > x)
Rotate the elements of x to the right by S positions.
void foreach_chunked_merged(index_t i_begin, index_t i_end, auto chunk_size, auto func_chunk, LoopDir dir=LoopDir::Forward)
Iterate over the range [i_begin, i_end) in chunks of size chunk_size, calling func_chunk for each chunk.
consteval auto make_2d_lut(F f)
Returns a 2D array of the form:
#define GUANAQO_TRACE_LINALG(name, gflops)
#define GUANAQO_TRACE_STATIC_STR(s)
void fill(T a, view< T, Abi, OB > B)
constexpr CopyConfig apply_options(CopyConfig conf, Opts...)
void copy(view< const T, Abi, OA > A, view< T, Abi, OB > B)
typename detail::simdified_value< V >::type simdified_value_t
typename detail::simdified_abi< V >::type simdified_abi_t
constexpr bool simdify_compatible
constexpr auto simdify(simdifiable auto &&a) -> simdified_view_t< decltype(a)>
constexpr std::optional< int > get_rotate
constexpr std::optional< int > get_mask
simd_view_types< std::remove_const_t< T >, Abi >::template view< T, Order > view
constexpr index_t RowsRegTranspose
constexpr index_t ColsRegTranspose
std::integral_constant< index_t, I > index_constant
Light-weight wrapper class used for overload resolution of triangular and symmetric matrices.
#define BATMAT_UNROLLED_IVDEP_FOR(N,...)