/// Shorthand for BATMAT_FULLY_UNROLLED_FOR: forwards every argument unchanged.
/// Presumably used as a `for`-loop header that the compiler should fully
/// unroll (inferred from the name only; confirm against that macro's definition).
#define UNROLL_FOR(...) BATMAT_FULLY_UNROLLED_FOR (__VA_ARGS__)
14template <
class T,
class Abi, KernelConfig Conf, index_t RowsReg, StorageOrder OA>
15[[gnu::hot, gnu::flatten]]
void
31 B_reg[l] = B.load(l, 0);
32 C_reg[l] = C ? C->load(l, 0) : simd{0};
38 auto All = A_cached.load(ll, ll);
39 Conf.negate ? (C_reg[ll] -= All * Blj) : (C_reg[ll] += All * Blj);
41 auto Ail = A_cached.load(ii, ll);
43 Conf.negate ? (C_reg[ii] -= Ail * Blj) : (C_reg[ii] += Ail * Blj);
44 Conf.negate ? (C_reg[ll] -= Ail * Bil) : (C_reg[ll] += Ail * Bil);
48 for (index_t i =
RowsReg; i < k; ++i) {
49 auto Cij = C ? C->load(i, 0) : simd{0};
52 auto Ail = A_cached.load(i, ll);
53 auto Bil = B.load(i, 0);
54 Conf.negate ? (Cij -= Ail * Blj) : (Cij += Ail * Blj);
55 Conf.negate ? (C_reg[ll] -= Ail * Bil) : (C_reg[ll] += Ail * Bil);
60 D.store(C_reg[ll], ll, 0);
64template <
class T,
class Abi, KernelConfig Conf, StorageOrder OA>
70 const index_t I = D.rows();
81 const std::optional<uview<const T, Abi, StorageOrder::ColMajor>> C_ = C;
85 return microkernel[I - 1](A_, B_, C_, D_, I);
86 microkernel[Rows - 1](A_, B_, C_, D_, I);
#define BATMAT_ASSUME(x)
Invokes undefined behavior if the expression x does not evaluate to true.
void foreach_chunked_merged(index_t i_begin, index_t i_end, auto chunk_size, auto func_chunk, LoopDir dir=LoopDir::Forward)
Iterate over the range [i_begin, i_end) in chunks of size chunk_size, calling func_chunk for each chunk...
stdx::simd< Tp, Abi > simd
constexpr index_t RowsReg
void symv_copy_register(view< const T, Abi, OA > A, view< const T, Abi > B, std::optional< view< const T, Abi > > C, view< T, Abi > D) noexcept
Symmetric matrix-vector multiplication d = c ± A b. Using register blocking.
void symv_copy_microkernel(uview< const T, Abi, OA > A, uview< const T, Abi, StorageOrder::ColMajor > B, std::optional< uview< const T, Abi, StorageOrder::ColMajor > > C, uview< T, Abi, StorageOrder::ColMajor > D, index_t k) noexcept
Symmetric matrix-vector multiplication d = c ± A b. Single register block.
const constinit auto symv_copy_lut
cached_uview< Order==StorageOrder::ColMajor ? Cols :Rows, T, Abi, Order > with_cached_access(const uview< T, Abi, Order > &o) noexcept
simd_view_types< std::remove_const_t< T >, Abi >::template view< T, Order > view
Self block(this const Self &self, index_t r, index_t c) noexcept
Self middle_rows(this const Self &self, index_t r) noexcept