10#define UNROLL_FOR(...) BATMAT_FULLY_UNROLLED_FOR (__VA_ARGS__)
14template <
class T,
class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OB, StorageOrder OD>
23[[gnu::hot, gnu::flatten]]
void
39 for (index_t i = 0; i < k; ++i) {
40 simd Bi = B.load(i, 0);
41 simd Di = D.load(i, 0);
44 simd Ail = A_cached.load(i, l);
49 Conf.negate ? (Di -= Ail * Bl[l]) : (Di += Ail * Bl[l]);
55 simd vl = D.load(l + l0, 0);
56 Conf.negate ? (vl -= accum[l]) : (vl += accum[l]);
57 D.store(vl, l + l0, 0);
62template <
class T,
class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OB, StorageOrder OD>
68 const index_t I = D.rows(), J = D.cols(), K = A.cols();
85 return microkernel[I - 1](A_, B_, D_, 0, K);
89 microkernel[nl - 1](Al, B_, D_, l, A.rows());
#define BATMAT_ASSUME(x)
Invokes undefined behavior if the expression x does not evaluate to true.
datapar::simd< F, Abi > rotl(datapar::simd< F, Abi > x)
Rotate the elements of x to the left by S positions.
datapar::simd< F, Abi > rotr(datapar::simd< F, Abi > x)
Rotate the elements of x to the right by S positions.
consteval auto make_1d_lut(F f)
Returns an array of the form:
void foreach_chunked_merged(index_t i_begin, index_t i_end, auto chunk_size, auto func_chunk, LoopDir dir=LoopDir::Forward)
Iterate over the range [i_begin, i_end) in chunks of size chunk_size, calling func_chunk for each chunk.
stdx::simd< Tp, Abi > simd
const constinit auto syomv_lut
constexpr index_t RowsReg
Register block size of the matrix-matrix multiplication micro-kernels.
void syomv_microkernel(uview< const T, Abi, OA > A, uview< const T, Abi, OB > B, uview< T, Abi, OD > D, index_t l0, index_t k) noexcept
Symmetric off-diagonal block multiply. Single register block.
void syomv_register(view< const T, Abi, OA > A, view< const T, Abi, OB > B, view< T, Abi, OD > D) noexcept
Generalized matrix multiplication D = C ± A⁽ᵀ⁾ B⁽ᵀ⁾. Using register blocking.
cached_uview< Order==StorageOrder::ColMajor ? Cols :Rows, T, Abi, Order > with_cached_access(const uview< T, Abi, Order > &o) noexcept
simd_view_types< std::remove_const_t< T >, Abi >::template view< T, Order > view
std::integral_constant< index_t, I > index_constant
Self middle_cols(this const Self &self, index_t c) noexcept