batmat 0.0.13
Batched linear algebra routines
Loading...
Searching...
No Matches
batmat::linalg::micro_kernels::potrf Namespace Reference

Classes

struct  KernelConfig

Functions

template<class T, class Abi, KernelConfig Conf, index_t RowsReg, StorageOrder O1, StorageOrder O2>
void potrf_copy_microkernel (uview< const T, Abi, O1 > A1, uview< const T, Abi, O2 > A2, uview< const T, Abi, O2 > C, uview< T, Abi, O2 > D, T *invD, index_t k1, index_t k2, T regularization) noexcept
template<class T, class Abi, KernelConfig Conf, index_t RowsReg, index_t ColsReg, StorageOrder O1, StorageOrder O2>
void trsm_copy_microkernel (uview< const T, Abi, O1 > A1, uview< const T, Abi, O1 > B1, uview< const T, Abi, O2 > A2, uview< const T, Abi, O2 > B2, uview< const T, Abi, O2 > L, const T *invL, uview< const T, Abi, O2 > C, uview< T, Abi, O2 > D, index_t k1, index_t k2) noexcept
template<class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OCD>
void potrf_copy_register (view< const T, Abi, OA > A, view< const T, Abi, OCD > C, view< T, Abi, OCD > D, T regularization) noexcept

Variables

template<class T, class Abi>
constexpr index_t ColsReg = RowsReg<T, Abi>
template<class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OC>
const constinit auto potrf_copy_lut
template<class T, class Abi, KernelConfig Conf, StorageOrder O1, StorageOrder O2>
const constinit auto trsm_copy_lut
template<class T, class Abi>
constexpr index_t RowsReg
 Register block size of the matrix-matrix multiplication micro-kernels.

Class Documentation

◆ batmat::linalg::micro_kernels::potrf::KernelConfig

struct batmat::linalg::micro_kernels::potrf::KernelConfig
Class Members
bool negate_A = false
MatrixStructure struc_C = MatrixStructure::LowerTriangular

Function Documentation

◆ potrf_copy_microkernel()

template<class T, class Abi, KernelConfig Conf, index_t RowsReg, StorageOrder O1, StorageOrder O2>
void batmat::linalg::micro_kernels::potrf::potrf_copy_microkernel ( const uview< const T, Abi, O1 > A1,
const uview< const T, Abi, O2 > A2,
const uview< const T, Abi, O2 > C,
const uview< T, Abi, O2 > D,
T *const invD,
const index_t k1,
const index_t k2,
T regularization )
noexcept
Parameters
A1: RowsReg×k1.
A2: RowsReg×k2.
C: RowsReg×RowsReg.
D: RowsReg×RowsReg.
invD: Inverse diagonal of D.
k1: Number of columns in A1.
k2: Number of columns in A2.
regularization: Regularization added to the diagonal of C before factorization.

Definition at line 24 of file potrf.tpp.

◆ trsm_copy_microkernel()

template<class T, class Abi, KernelConfig Conf, index_t RowsReg, index_t ColsReg, StorageOrder O1, StorageOrder O2>
void batmat::linalg::micro_kernels::potrf::trsm_copy_microkernel ( const uview< const T, Abi, O1 > A1,
const uview< const T, Abi, O1 > B1,
const uview< const T, Abi, O2 > A2,
const uview< const T, Abi, O2 > B2,
const uview< const T, Abi, O2 > L,
const T * invL,
const uview< const T, Abi, O2 > C,
const uview< T, Abi, O2 > D,
const index_t k1,
const index_t k2 )
noexcept
Parameters
A1: RowsReg×k1.
B1: ColsReg×k1.
A2: RowsReg×k2.
B2: ColsReg×k2.
L: ColsReg×ColsReg.
invL: ColsReg (inverted diagonal of L).
C: RowsReg×ColsReg.
D: RowsReg×ColsReg.
k1: Number of columns in A1 and B1.
k2: Number of columns in A2 and B2.

Definition at line 126 of file potrf.tpp.

◆ potrf_copy_register()

template<class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OCD>
void batmat::linalg::micro_kernels::potrf::potrf_copy_register ( view< const T, Abi, OA > A,
view< const T, Abi, OCD > C,
view< T, Abi, OCD > D,
T regularization )
noexcept

Definition at line 185 of file potrf.tpp.

Variable Documentation

◆ ColsReg

template<class T, class Abi>
index_t batmat::linalg::micro_kernels::potrf::ColsReg = RowsReg<T, Abi>
constexpr

Definition at line 34 of file potrf.hpp.

◆ potrf_copy_lut

template<class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OC>
const constinit auto batmat::linalg::micro_kernels::potrf::potrf_copy_lut
inline constinit
Initial value:
=
})
consteval auto make_1d_lut(F f)
Returns an array of the form:
Definition lut.hpp:39
void potrf_copy_microkernel(uview< const T, Abi, O1 > A1, uview< const T, Abi, O2 > A2, uview< const T, Abi, O2 > C, uview< T, Abi, O2 > D, T *invD, index_t k1, index_t k2, T regularization) noexcept
Definition potrf.tpp:24
std::integral_constant< index_t, I > index_constant
Definition lut.hpp:10

Definition at line 37 of file potrf.hpp.

◆ trsm_copy_lut

template<class T, class Abi, KernelConfig Conf, StorageOrder O1, StorageOrder O2>
const constinit auto batmat::linalg::micro_kernels::potrf::trsm_copy_lut
inline constinit
Initial value:
[]<index_t Row, index_t Col>(index_constant<Row>, index_constant<Col>) {
})
consteval auto make_2d_lut(F f)
Returns a 2D array of the form:
Definition lut.hpp:25
void trsm_copy_microkernel(uview< const T, Abi, O1 > A1, uview< const T, Abi, O1 > B1, uview< const T, Abi, O2 > A2, uview< const T, Abi, O2 > B2, uview< const T, Abi, O2 > L, const T *invL, uview< const T, Abi, O2 > C, uview< T, Abi, O2 > D, index_t k1, index_t k2) noexcept
Definition potrf.tpp:126

Definition at line 43 of file potrf.hpp.

◆ RowsReg

template<class T, class Abi>
index_t batmat::linalg::micro_kernels::gemm::RowsReg
inline constexpr

Register block size of the matrix-matrix multiplication micro-kernels.

AVX-512 has 32 vector registers; we use 25 registers for a 5×5 accumulator block of matrix C (leaving some registers for loading A and B).

AVX2 has 16 vector registers; we use 9 registers for a 3×3 accumulator block of matrix C (leaving some registers for loading A and B).

Note
A block size of 4×4 is slightly faster than 3×3 for large matrices, because the even block size results in full cache lines being consumed. For small matrices, 3×3 is faster because it does not spill any registers in the micro-kernels. 2×2 is slower than 3×3 for both small and large matrices (tested using GCC 15.1 on an i7-10750H).

Assumes that the platform has at least 16 vector registers; we use 9 registers for a 3×3 accumulator block of matrix C (leaving some registers for loading A and B).

NEON has 32 vector registers; we use 16 registers for a 4×4 accumulator block of matrix C (leaving plenty of registers for loading A and B).

Definition at line 13 of file avx-512.hpp.