9#define UNROLL_FOR(...) BATMAT_FULLY_UNROLLED_FOR (__VA_ARGS__)
16[[gnu::hot, gnu::flatten]]
void
32 for (index_t i = 0; i < k; ++i) {
33 simd Bi = B.load(i, 0);
34 simd Di = D.load(i, 0);
37 simd Ail = A_cached.load(i, l);
42 Conf.negate ? (Di -= Ail * Bl[l]) : (Di += Ail * Bl[l]);
48 simd vl = D.load(l + l0, 0);
49 Conf.negate ? (vl -= accum[l]) : (vl += accum[l]);
50 D.store(vl, l + l0, 0);
55template <
class T,
class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OB, StorageOrder OD>
61 const index_t I = D.rows(), J = D.cols(), K = A.cols();
78 return microkernel[I - 1](A_, B_, D_, 0, K);
82 microkernel[nl - 1](Al, B_, D_, l, A.rows());
#define BATMAT_ASSUME(x)
Invokes undefined behavior if the expression x does not evaluate to true.
datapar::simd< F, Abi > shiftl(datapar::simd< F, Abi > x)
Shift the elements of x to the left by S positions, shifting in zeros.
datapar::simd< F, Abi > shiftr(datapar::simd< F, Abi > x)
Shift the elements of x to the right by S positions, shifting in zeros.
void foreach_chunked_merged(index_t i_begin, index_t i_end, auto chunk_size, auto func_chunk, LoopDir dir=LoopDir::Forward)
Iterate over the range [i_begin, i_end) in chunks of size chunk_size, calling func_chunk for each chunk...
stdx::simd< Tp, Abi > simd
const constinit auto syomv_lut
constexpr index_t RowsReg
Register block size of the matrix-matrix multiplication micro-kernels.
void syomv_microkernel(uview< const T, Abi, OA > A, uview< const T, Abi, OB > B, uview< T, Abi, OD > D, index_t l0, index_t k) noexcept
Symmetric off-diagonal block multiply. Single register block.
void syomv_register(view< const T, Abi, OA > A, view< const T, Abi, OB > B, view< T, Abi, OD > D) noexcept
Symmetric off-diagonal block multiply, accumulating into D (added or subtracted according to Conf.negate). Using register blocking.
cached_uview< Order==StorageOrder::ColMajor ? Cols :Rows, T, Abi, Order > with_cached_access(const uview< T, Abi, Order > &o) noexcept
simd_view_types< std::remove_const_t< T >, Abi >::template view< T, Order > view
Self middle_cols(this const Self &self, index_t c) noexcept