9#define UNROLL_FOR(...) BATMAT_FULLY_UNROLLED_FOR (__VA_ARGS__)
18template <
class T,
class Abi, KernelConfig Conf, index_t RowsReg, StorageOrder OA, StorageOrder OD>
21 const index_t k)
noexcept {
31 auto A1r = [&A1_reg](index_t r, index_t c) -> simd & {
32 return A1_reg[c * (2 *
RowsReg - 1 - c) / 2 + r];
36 A1r(ii, jj) = A1_cached.load(ii, jj);
56 A1r(jj, jj) = simd{1} / A1r(jj, jj);
60 A1r(ii, jj) *= -A1r(jj, jj);
65 A1r(ii, jj) += A1r(ii, ll) * A1r(ll, jj);
66 A1r(ll, jj) *= A1r(ll, ll);
76 D1_cached.store(A1r(i, j), i, j);
79 for (index_t l =
RowsReg; l < k; ++l) {
82 A2r[i] = A_cached.load(l, i);
86 A2r[i] -= A2r[j] * A1r(j, i);
87 D_cached.store(A2r[i], l, i);
96template <
class T,
class Abi, KernelConfig Conf, index_t RowsReg, index_t ColsReg, StorageOrder OD>
97[[gnu::hot, gnu::flatten]]
99 const index_t k)
noexcept {
109 for (index_t l = 0; l < k -
RowsReg; ++l)
111 simd Ail = A1_cached.load(ii, l);
113 simd &Cij = D_reg[ii][jj];
114 simd Blj = B1_cached.load(l, jj);
121 simd Ail = A1_cached.load(ii, k -
RowsReg + ll);
123 D_reg[ii][jj] += Ail * B1_cached.load(k -
RowsReg + ll, jj);
128 D.store(D_reg[ii][jj], k -
RowsReg + ii, jj);
131template <
class T,
class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OD>
137 const index_t I = A.rows();
150 return trtri_microkernel[I - 1](A_, D_, I);
159 [&](index_t j,
auto nj) {
160 const auto jp = j + nj;
161 const auto Ajj = A_.
block(j, j);
162 const auto Djj = D_.
block(j, j);
163 const auto Dj = D_.
block(jp, j);
165 trtri_microkernel[nj - 1](Ajj, Djj, I - j);
169 [&](index_t i,
auto ni) {
171 const auto Dpi = D_.
block(i, jp);
#define BATMAT_ASSUME(x)
Invokes undefined behavior if the expression x does not evaluate to true.
void foreach_chunked_merged(index_t i_begin, index_t i_end, auto chunk_size, auto func_chunk, LoopDir dir=LoopDir::Forward)
Iterate over the range [i_begin, i_end) in chunks of size chunk_size, calling func_chunk for each chu...
stdx::simd< Tp, Abi > simd
const constinit auto trmm_lut
constexpr index_t RowsReg
Register block size of the matrix-matrix multiplication micro-kernels.
const constinit auto trtri_copy_lut
void trmm_microkernel(uview< const T, Abi, OD > Dr, uview< T, Abi, OD > D, index_t k) noexcept
void trtri_copy_register(view< const T, Abi, OA > A, view< T, Abi, OD > D) noexcept
constexpr index_t ColsReg
void trtri_copy_microkernel(uview< const T, Abi, OA > A, uview< T, Abi, OD > D, index_t k) noexcept
cached_uview< Order==StorageOrder::ColMajor ? Cols :Rows, T, Abi, Order > with_cached_access(const uview< T, Abi, Order > &o) noexcept
simd_view_types< std::remove_const_t< T >, Abi >::template view< T, Order > view
Self block(this const Self &self, index_t r, index_t c) noexcept