11#define UNROLL_FOR(...) BATMAT_FULLY_UNROLLED_FOR (__VA_ARGS__)
15template <
class T,
class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OB, StorageOrder OD>
27[[gnu::hot, gnu::flatten]]
void
42 B_reg[ii][jj] = rotl<Conf.rotate_B>(B_cached.load(ii, jj));
45 const index_t l0 = lower ? 0 :
RowsReg, l1 = lower ? k : k +
RowsReg;
46 for (index_t l = l0; l < l1; ++l)
48 simd Xlj = D_cached.load(l, jj);
50 simd Ail = A.load(ii, l);
51 simd &Bij = B_reg[ii][jj];
56 if constexpr (lower) {
58 simd Aii = simd{1} / A.load(ii, k + ii);
60 simd &Xij = B_reg[ii][jj];
62 simd Ail = A.load(ii, k + ll);
63 simd &Xlj = B_reg[ll][jj];
71 simd Aii = simd{1} / A.load(ii, ii);
73 simd &Xij = B_reg[ii][jj];
75 simd Ail = A.load(ii, ll);
76 simd &Xlj = B_reg[ll][jj];
86 D_cached.store(B_reg[ii][jj], lower ? k + ii : ii, jj);
91template <
class T,
class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OB, StorageOrder OD>
98 const index_t I = A.rows(), K = A.cols(), J = B.cols();
113 if (I <= Rows && J <= Cols)
114 return microkernel[I - 1][J - 1](A_, B_, D_, 0);
117 auto blk = [&] [[gnu::always_inline]] (index_t i, index_t ni, index_t j, index_t nj) {
123 auto Bij = B_.
block(i, j);
125 microkernel[ni - 1][nj - 1](Ai0, Bij, X0j, i + K - I);
127 auto Ai0 = A_.
block(i, i);
128 auto Bij = B_.
block(i, j);
129 auto X0j = D_.
block(i, j);
130 microkernel[ni - 1][nj - 1](Ai0, Bij, X0j, K - i - ni);
133 if constexpr (OD == StorageOrder::ColMajor)
136 [&](index_t j,
auto nj) {
144 [&](index_t i,
auto ni) {
146 0, J, Cols, [&](index_t j,
auto nj) { blk(i, ni, j, nj); },
LoopDir::Forward);
#define BATMAT_ASSUME(x)
Invokes undefined behavior if the expression x does not evaluate to true.
void foreach_chunked_merged(index_t i_begin, index_t i_end, auto chunk_size, auto func_chunk, LoopDir dir=LoopDir::Forward)
Iterate over the range [i_begin, i_end) in chunks of size chunk_size, calling func_chunk for each chu...
consteval auto make_2d_lut(F f)
Returns a 2D array of the form:
stdx::simd< Tp, Abi > simd
void trsm_copy_microkernel(uview< const T, Abi, OA > A, uview< const T, Abi, OB > B, uview< T, Abi, OD > D, index_t k) noexcept
constexpr index_t RowsReg
Register block size of the matrix-matrix multiplication micro-kernels.
void trsm_copy_register(view< const T, Abi, OA > A, view< const T, Abi, OB > B, view< T, Abi, OD > D) noexcept
Triangular solve D = (A⁽ᵀ⁾)⁻¹ B⁽ᵀ⁾ where A⁽ᵀ⁾ is lower triangular.
constexpr index_t ColsReg
const constinit auto trsm_copy_lut
cached_uview< Order==StorageOrder::ColMajor ? Cols :Rows, T, Abi, Order > with_cached_access(const uview< T, Abi, Order > &o) noexcept
simd_view_types< std::remove_const_t< T >, Abi >::template view< T, Order > view
std::integral_constant< index_t, I > index_constant
Self block(this const Self &self, index_t r, index_t c) noexcept
Self middle_rows(this const Self &self, index_t r) noexcept
Self middle_cols(this const Self &self, index_t c) noexcept