Namespaces
namespace	detail

Classes
struct	KernelConfig

Functions
template<class T, class Abi, KernelConfig Conf, index_t RowsReg, index_t ColsReg, StorageOrder OA, StorageOrder OB, StorageOrder OC, StorageOrder OD>
void	gemm_copy_microkernel (const uview< const T, Abi, OA > A, const uview< const T, Abi, OB > B, const std::optional< uview< const T, Abi, OC > > C, const uview< T, Abi, OD > D, const index_t k) noexcept
	Generalized matrix multiplication D = C ± A⁽ᵀ⁾ B⁽ᵀ⁾. Single register block.
template<class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OB, StorageOrder OC, StorageOrder OD>
void	gemm_copy_register (const view< const T, Abi, OA > A, const view< const T, Abi, OB > B, const std::optional< view< const T, Abi, OC > > C, const view< T, Abi, OD > D) noexcept
	Generalized matrix multiplication D = C ± A⁽ᵀ⁾ B⁽ᵀ⁾. Using register blocking.

Variables
template<class T, class Abi>
constexpr index_t	ColsReg = RowsReg<T, Abi>
template<class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OB, StorageOrder OC, StorageOrder OD>
const constinit decltype(detail::gemm_copy_lut< T, Abi, Conf, OA, OB, OC, OD >)	gemm_copy_lut = detail::gemm_copy_lut<T, Abi, Conf, OA, OB, OC, OD>
template<MatrixStructure Struc>
constexpr auto	first_column
template<index_t ColsReg, MatrixStructure Struc>
constexpr auto	last_column
template<class T, class Abi>
constexpr index_t	RowsReg = 5
	Register block size of the matrix-matrix multiplication micro-kernels.
template<class T, class Abi> requires (datapar::simd_size<T, Abi>::value * sizeof(T) > 32)
constexpr index_t	RowsReg< T, Abi > = 3

Class Documentation

◆ batmat::linalg::micro_kernels::gemm::KernelConfig

struct batmat::linalg::micro_kernels::gemm::KernelConfig

Class Members
bool	negate = false
int	shift_A = 0
int	rotate_B = 0
int	rotate_C = 0
int	rotate_D = rotate_C
int	mask_D = rotate_D
MatrixStructure	struc_A = MatrixStructure::General
MatrixStructure	struc_B = MatrixStructure::General
MatrixStructure	struc_C = MatrixStructure::General

Function Documentation

◆ gemm_copy_microkernel()

template<class T, class Abi, KernelConfig Conf, index_t RowsReg, index_t ColsReg, StorageOrder OA, StorageOrder OB, StorageOrder OC, StorageOrder OD>

void batmat::linalg::micro_kernels::gemm::gemm_copy_microkernel	(	uview< const T, Abi, OA >	A,
		uview< const T, Abi, OB >	B,
		std::optional< uview< const T, Abi, OC > >	C,
		uview< T, Abi, OD >	D,
		index_t	k )

noexcept

Generalized matrix multiplication D = C ± A⁽ᵀ⁾ B⁽ᵀ⁾. Single register block.

Definition at line 36 of file gemm.tpp.

◆ gemm_copy_register()

template<class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OB, StorageOrder OC, StorageOrder OD>

void batmat::linalg::micro_kernels::gemm::gemm_copy_register	(	view< const T, Abi, OA >	A,
		view< const T, Abi, OB >	B,
		std::optional< view< const T, Abi, OC > >	C,
		view< T, Abi, OD >	D )

noexcept

Generalized matrix multiplication D = C ± A⁽ᵀ⁾ B⁽ᵀ⁾. Using register blocking.

Definition at line 174 of file gemm.tpp.

Variable Documentation

◆ ColsReg

template<class T, class Abi>

index_t batmat::linalg::micro_kernels::gemm::ColsReg = RowsReg<T, Abi>

constexpr

Definition at line 38 of file gemm.hpp.

◆ gemm_copy_lut

template<class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OB, StorageOrder OC, StorageOrder OD>

const constinit decltype(detail::gemm_copy_lut<T, Abi, Conf, OA, OB, OC, OD>) batmat::linalg::micro_kernels::gemm::gemm_copy_lut = detail::gemm_copy_lut<T, Abi, Conf, OA, OB, OC, OD>

extern constinit

Definition at line 20 of file gemm.tpp.

◆ first_column

template<MatrixStructure Struc>

auto batmat::linalg::micro_kernels::gemm::first_column

inline constexpr

Initial value:

=

[](index_t row_index) { return Struc == MatrixStructure::UpperTriangular ? row_index : 0; }

batmat::linalg::MatrixStructure::UpperTriangular

@ UpperTriangular

Definition structure.hpp:8

batmat::index_t

int index_t

Definition config.hpp:13

Definition at line 23 of file gemm.tpp.

◆ last_column

template<index_t ColsReg, MatrixStructure Struc>

auto batmat::linalg::micro_kernels::gemm::last_column

inline constexpr

Initial value:

                                  = [](index_t row_index) {
    return Struc == MatrixStructure::LowerTriangular ? std::min(row_index, ColsReg - 1)
                                                     : ColsReg - 1;
}

Definition at line 27 of file gemm.tpp.

◆ RowsReg

template<class T, class Abi>

index_t batmat::linalg::micro_kernels::gemm::RowsReg = 5

inline constexpr

Register block size of the matrix-matrix multiplication micro-kernels.

AVX-512 has 32 vector registers; we use 25 registers for a 5×5 accumulator block of matrix C (leaving some registers for loading A and B).

AVX2 has 16 vector registers; we use 9 registers for a 3×3 accumulator block of matrix C (leaving some registers for loading A and B).

Note: A block size of 4×4 is slightly faster than 3×3 for large matrices, because the even block size results in full cache lines being consumed. For small matrices, 3×3 is faster because it does not spill any registers in the micro-kernels. 2×2 is slower than 3×3 for both small and large matrices (tested using GCC 15.1 on an i7-10750H).

Assumes that the platform has at least 16 vector registers; we use 9 registers for a 3×3 accumulator block of matrix C (leaving some registers for loading A and B).

NEON has 32 vector registers; we use 16 registers for a 4×4 accumulator block of matrix C (leaving plenty of registers for loading A and B).

Definition at line 13 of file avx-512.hpp.

◆ RowsReg< T, Abi >

template<class T, class Abi>
requires (datapar::simd_size<T, Abi>::value * sizeof(T) > 32)

index_t batmat::linalg::micro_kernels::gemm::RowsReg< T, Abi > = 3

inline constexpr

Definition at line 17 of file avx-512.hpp.

Namespaces

Classes

Functions

Variables

Class Documentation

◆ batmat::linalg::micro_kernels::gemm::KernelConfig

Function Documentation

◆ gemm_copy_microkernel()

◆ gemm_copy_register()

Variable Documentation

◆ ColsReg

◆ gemm_copy_lut

◆ first_column

◆ last_column

◆ RowsReg

◆ RowsReg< T, Abi >