Classes
struct	KernelConfig

Functions
template<class T, class Abi, KernelConfig Conf, index_t RowsReg, StorageOrder O1, StorageOrder O2>
void	potrf_copy_microkernel (uview< const T, Abi, O1 > A1, uview< const T, Abi, O2 > A2, uview< const T, Abi, O2 > C, uview< T, Abi, O2 > D, T *invD, index_t k1, index_t k2, T regularization) noexcept
template<class T, class Abi, KernelConfig Conf, index_t RowsReg, index_t ColsReg, StorageOrder O1, StorageOrder O2>
void	trsm_copy_microkernel (uview< const T, Abi, O1 > A1, uview< const T, Abi, O1 > B1, uview< const T, Abi, O2 > A2, uview< const T, Abi, O2 > B2, uview< const T, Abi, O2 > L, const T *invL, uview< const T, Abi, O2 > C, uview< T, Abi, O2 > D, index_t k1, index_t k2) noexcept
template<class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OCD>
void	potrf_copy_register (view< const T, Abi, OA > A, view< const T, Abi, OCD > C, view< T, Abi, OCD > D, T regularization) noexcept

Variables
template<class T, class Abi>
constexpr index_t	ColsReg = RowsReg<T, Abi>
template<class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OC>
const constinit auto	potrf_copy_lut
template<class T, class Abi, KernelConfig Conf, StorageOrder O1, StorageOrder O2>
const constinit auto	trsm_copy_lut
template<class T, class Abi>
constexpr index_t	RowsReg
	Register block size of the matrix-matrix multiplication micro-kernels.

Class Documentation

◆ batmat::linalg::micro_kernels::potrf::KernelConfig

struct batmat::linalg::micro_kernels::potrf::KernelConfig

Class Members
bool	negate_A = false
MatrixStructure	struc_C = MatrixStructure::LowerTriangular

Function Documentation

◆ potrf_copy_microkernel()

template<class T, class Abi, KernelConfig Conf, index_t RowsReg, StorageOrder O1, StorageOrder O2>

void batmat::linalg::micro_kernels::potrf::potrf_copy_microkernel	(	const uview< const T, Abi, O1 >	A1,
		const uview< const T, Abi, O2 >	A2,
		const uview< const T, Abi, O2 >	C,
		const uview< T, Abi, O2 >	D,
		T *const	invD,
		const index_t	k1,
		const index_t	k2,
		T	regularization )

noexcept

Parameters

A1	RowsReg×k1.
A2	RowsReg×k2.
C	RowsReg×RowsReg.
D	RowsReg×RowsReg.
invD	Inverted diagonal of D.
k1	Number of columns in A1.
k2	Number of columns in A2.
regularization	Regularization added to the diagonal of C before factorization.

Definition at line 24 of file potrf.tpp.

◆ trsm_copy_microkernel()

template<class T, class Abi, KernelConfig Conf, index_t RowsReg, index_t ColsReg, StorageOrder O1, StorageOrder O2>

void batmat::linalg::micro_kernels::potrf::trsm_copy_microkernel	(	const uview< const T, Abi, O1 >	A1,
		const uview< const T, Abi, O1 >	B1,
		const uview< const T, Abi, O2 >	A2,
		const uview< const T, Abi, O2 >	B2,
		const uview< const T, Abi, O2 >	L,
		const T *	invL,
		const uview< const T, Abi, O2 >	C,
		const uview< T, Abi, O2 >	D,
		const index_t	k1,
		const index_t	k2 )

noexcept

Parameters

A1	RowsReg×k1.
B1	ColsReg×k1.
A2	RowsReg×k2.
B2	ColsReg×k2.
L	ColsReg×ColsReg.
invL	ColsReg (inverted diagonal of L).
C	RowsReg×ColsReg.
D	RowsReg×ColsReg.
k1	Number of columns in A1 and B1.
k2	Number of columns in A2 and B2.

Definition at line 126 of file potrf.tpp.

◆ potrf_copy_register()

template<class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OCD>

void batmat::linalg::micro_kernels::potrf::potrf_copy_register	(	view< const T, Abi, OA >	A,
		view< const T, Abi, OCD >	C,
		view< T, Abi, OCD >	D,
		T	regularization )

noexcept

Definition at line 185 of file potrf.tpp.

Variable Documentation

◆ ColsReg

template<class T, class Abi>

index_t batmat::linalg::micro_kernels::potrf::ColsReg = RowsReg<T, Abi>

constexpr

Definition at line 34 of file potrf.hpp.

◆ potrf_copy_lut

template<class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OC>

const constinit auto batmat::linalg::micro_kernels::potrf::potrf_copy_lut

inline constinit

Initial value:

                                       =
make_1d_lut<RowsReg<T, Abi>>([]<index_t Row>(index_constant<Row>) {
    return potrf_copy_microkernel<T, Abi, Conf, Row + 1, OA, OC>;
})

Definition at line 37 of file potrf.hpp.

◆ trsm_copy_lut

template<class T, class Abi, KernelConfig Conf, StorageOrder O1, StorageOrder O2>

const constinit auto batmat::linalg::micro_kernels::potrf::trsm_copy_lut

inline constinit

Initial value:

                                      = make_2d_lut<RowsReg<T, Abi>, ColsReg<T, Abi>>(
[]<index_t Row, index_t Col>(index_constant<Row>, index_constant<Col>) {
    return trsm_copy_microkernel<T, Abi, Conf, Row + 1, Col + 1, O1, O2>;
})

Definition at line 43 of file potrf.hpp.

◆ RowsReg

template<class T, class Abi>

index_t batmat::linalg::micro_kernels::gemm::RowsReg

inline constexpr

Register block size of the matrix-matrix multiplication micro-kernels.

AVX-512 has 32 vector registers; we use 25 registers for a 5×5 accumulator block of matrix C (leaving some registers for loading A and B).

AVX2 has 16 vector registers; we use 9 registers for a 3×3 accumulator block of matrix C (leaving some registers for loading A and B).

Note: A block size of 4×4 is slightly faster than 3×3 for large matrices, because the even block size results in full cache lines being consumed. For small matrices, 3×3 is faster because it does not spill any registers in the micro-kernels. 2×2 is slower than 3×3 for both small and large matrices (tested using GCC 15.1 on an i7-10750H).

Assumes that the platform has at least 16 vector registers; we use 9 registers for a 3×3 accumulator block of matrix C (leaving some registers for loading A and B).

NEON has 32 vector registers; we use 16 registers for a 4×4 accumulator block of matrix C (leaving plenty of registers for loading A and B).

Definition at line 13 of file avx-512.hpp.

Classes

Functions

Variables

Class Documentation

◆ batmat::linalg::micro_kernels::potrf::KernelConfig

Function Documentation

◆ potrf_copy_microkernel()

◆ trsm_copy_microkernel()

◆ potrf_copy_register()

Variable Documentation

◆ ColsReg

◆ potrf_copy_lut

◆ trsm_copy_lut

◆ RowsReg